From 6423b783be0064f8508978dce4602bd4fb96a4de Mon Sep 17 00:00:00 2001
From: anastasia
Date: Sun, 16 May 2021 12:55:09 +0300
Subject: [PATCH 001/214] [smgr_api] [community] smgr_api.patch

Make the smgr API pluggable. Add an smgr_hook that can be used to define
custom storage managers. Remove the smgrsw[] array and the smgr_which
selector; instead, smgropen() loads the f_smgr implementation through
smgr_hook.

Also add smgr_init_hook and smgr_shutdown_hook, along with a number of
mechanical changes to the smgr.c functions.

This patch has been proposed to the community:
https://commitfest.postgresql.org/33/3216/

Author: anastasia
---
 src/backend/storage/smgr/smgr.c | 159 +++++++++++++++------
 src/include/storage/smgr.h      |  56 ++++++++++-
 2 files changed, 131 insertions(+), 84 deletions(-)

diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4dc24649df9..b455d07edce 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -18,6 +18,7 @@
 #include "postgres.h"
 
 #include "access/xlog.h"
+#include "catalog/pg_tablespace.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
@@ -26,47 +27,8 @@
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
-
-/*
- * This struct of function pointers defines the API between smgr.c and
- * any individual storage manager module. Note that smgr subfunctions are
- * generally expected to report problems via elog(ERROR). An exception is
- * that smgr_unlink should use elog(WARNING), rather than erroring out,
- * because we normally unlink relations during post-commit/abort cleanup,
- * and so it's too late to raise an error. Also, various conditions that
- * would normally be errors should be allowed during bootstrap and/or WAL
- * recovery --- see comments in md.c for details.
- */
-typedef struct f_smgr
-{
-	void		(*smgr_init) (void);	/* may be NULL */
-	void		(*smgr_shutdown) (void);	/* may be NULL */
-	void		(*smgr_open) (SMgrRelation reln);
-	void		(*smgr_close) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_create) (SMgrRelation reln, ForkNumber forknum,
-								bool isRedo);
-	bool		(*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
-								bool isRedo);
-	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
-								BlockNumber blocknum, char *buffer, bool skipFsync);
-	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
-								  BlockNumber blocknum);
-	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
-							  BlockNumber blocknum, char *buffer);
-	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
-							   BlockNumber blocknum, char *buffer, bool skipFsync);
-	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
-								   BlockNumber blocknum, BlockNumber nblocks);
-	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
-								  BlockNumber nblocks);
-	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
-} f_smgr;
-
-static const f_smgr smgrsw[] = {
+static const f_smgr smgr_md = {
 	/* magnetic disk */
-	{
 		.smgr_init = mdinit,
 		.smgr_shutdown = NULL,
 		.smgr_open = mdopen,
@@ -82,11 +44,8 @@ static const f_smgr smgrsw[] = {
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
 		.smgr_immedsync = mdimmedsync,
-	}
 };
 
-static const int NSmgr = lengthof(smgrsw);
-
 /*
  * Each backend has a hashtable that stores all extant SMgrRelation objects.
  * In addition, "unowned" SMgrRelation objects are chained together in a list.
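/*
 * Illustrative sketch only, not part of the patch: a minimal extension that
 * plugs a custom storage manager into the hooks introduced above. The hook
 * variables and the f_smgr struct come from this patch; everything named
 * "my_*" is a hypothetical placeholder. contrib/zenith (patch 002 in this
 * series) registers its storage manager from _PG_init() in essentially the
 * same way.
 */
#include "postgres.h"
#include "fmgr.h"
#include "storage/smgr.h"

PG_MODULE_MAGIC;

void		_PG_init(void);

static void my_init(void) { /* set up private state */ }
static void my_open(SMgrRelation reln) { /* nothing to do in this sketch */ }

/* A real storage manager must provide every f_smgr callback. */
static const f_smgr my_smgr = {
	.smgr_init = my_init,
	.smgr_open = my_open,
	/* ... remaining callbacks elided ... */
};

static const f_smgr *
my_smgr_selector(BackendId backend, RelFileNode rnode)
{
	/* Keep using md.c for backend-private (temporary) relations. */
	if (backend != InvalidBackendId)
		return smgr_standard(backend, rnode);
	return &my_smgr;
}

void
_PG_init(void)
{
	smgr_hook = my_smgr_selector;
	smgr_init_hook = my_init;
}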
@@ -96,7 +55,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +//static void smgrshutdown(int code, Datum arg); /* @@ -110,33 +69,71 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + if (smgr_init_hook) + (*smgr_init_hook)(); - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } + smgr_init_standard(); - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); + /* + * ZENITH XXX + * This doesn't work with inmem_smgr, so temporarily disable. + * Anyway, we don't have any real smgrshutdown function. + */ + // /* register the shutdown proc */ + // on_proc_exit(smgrshutdown, 0); } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +//ZENITH XXX See comment above. Silence compiler warning. +// /* +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +// static void +// smgrshutdown(int code, Datum arg) +// { +// if (smgr_shutdown_hook) +// (*smgr_shutdown_hook)(); + +// smgr_shutdown_standard(); +// } + +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = NULL; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; + +const f_smgr * +smgr_standard(BackendId backend, RelFileNode rnode) { - int i; + return &smgr_md; +} - for (i = 0; i < NSmgr; i++) +void +smgr_init_standard(void) +{ + mdinit(); +} + +void +smgr_shutdown_standard(void) +{ +} + +const f_smgr * +smgr(BackendId backend, RelFileNode rnode) +{ + const f_smgr *result; + + if (smgr_hook) { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); + result = (*smgr_hook)(backend, rnode); } + else + result = smgr_standard(backend, rnode); + + return result; } + /* * smgropen() -- Return an SMgrRelation object, creating it if need be. 
* @@ -176,10 +173,11 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_targblock = InvalidBlockNumber; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rnode); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); @@ -246,7 +244,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -259,7 +257,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -332,7 +330,7 @@ smgrclosenode(RelFileNodeBackend rnode) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -360,12 +358,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -404,13 +400,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; - int which = rels[i]->smgr_which; rnodes[i] = rnode; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -439,10 +434,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rnodes[i], forknum, isRedo); } pfree(rnodes); @@ -462,7 +455,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* @@ -486,7 +479,7 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -501,7 +494,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -523,7 +516,7 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + (*reln->smgr).smgr_write(reln, forknum, blocknum, buffer, skipFsync); } @@ -536,7 +529,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber 
forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } @@ -554,7 +547,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -620,7 +613,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -659,7 +652,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); } /* diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a6fbf7b6a6c..a7c98c7e7fe 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,8 @@ #include "storage/block.h" #include "storage/relfilenode.h" +struct f_smgr; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -59,7 +61,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,6 +79,58 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. 
+ */
+typedef struct f_smgr
+{
+	void		(*smgr_init) (void);	/* may be NULL */
+	void		(*smgr_shutdown) (void);	/* may be NULL */
+	void		(*smgr_open) (SMgrRelation reln);
+	void		(*smgr_close) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_create) (SMgrRelation reln, ForkNumber forknum,
+								bool isRedo);
+	bool		(*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
+								bool isRedo);
+	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+								BlockNumber blocknum, char *buffer, bool skipFsync);
+	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+								  BlockNumber blocknum);
+	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
+							  BlockNumber blocknum, char *buffer);
+	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
+							   BlockNumber blocknum, char *buffer, bool skipFsync);
+	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
+								   BlockNumber blocknum, BlockNumber nblocks);
+	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
+								  BlockNumber nblocks);
+	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+} f_smgr;
+
+typedef void (*smgr_init_hook_type) (void);
+typedef void (*smgr_shutdown_hook_type) (void);
+extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook;
+extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook;
+extern void smgr_init_standard(void);
+extern void smgr_shutdown_standard(void);
+
+
+typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode);
+extern PGDLLIMPORT smgr_hook_type smgr_hook;
+extern const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode);
+
+extern const f_smgr *smgr(BackendId backend, RelFileNode rnode);
+
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);

From 6140082d022ada93e957a53adf4a35fcc0563f30 Mon Sep 17 00:00:00 2001
From: anastasia
Date: Sun, 16 May 2021 12:58:03 +0300
Subject: [PATCH 002/214] [contrib/zenith] contrib_zenith.patch

Add contrib/zenith, which handles interaction with the remote pagestore.
To use it, add 'shared_preload_libraries = zenith' to postgresql.conf.
It adds a protocol for network communication (see libpagestore.c) and
implements the smgr API.
Also it adds several custom GUC variables: - zenith.page_server_connstring - zenith.callmemaybe_connstring - zenith.zenith_timeline - zenith.wal_redo Authors: Stas Kelvich Konstantin Knizhnik Heikki Linnakangas --- contrib/zenith/Makefile | 25 + contrib/zenith/inmem_smgr.c | 298 ++++++++++ contrib/zenith/libpagestore.c | 258 +++++++++ contrib/zenith/pagestore_client.h | 151 +++++ contrib/zenith/pagestore_smgr.c | 930 ++++++++++++++++++++++++++++++ contrib/zenith/zenith.control | 4 + 6 files changed, 1666 insertions(+) create mode 100644 contrib/zenith/Makefile create mode 100644 contrib/zenith/inmem_smgr.c create mode 100644 contrib/zenith/libpagestore.c create mode 100644 contrib/zenith/pagestore_client.h create mode 100644 contrib/zenith/pagestore_smgr.c create mode 100644 contrib/zenith/zenith.control diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile new file mode 100644 index 00000000000..ad41c55bd71 --- /dev/null +++ b/contrib/zenith/Makefile @@ -0,0 +1,25 @@ +# contrib/zenith/Makefile + + +MODULE_big = zenith +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o libpagestore.o pagestore_smgr.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = zenith +PGFILEDESC = "zenith - cloud storage for PostgreSQL" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +SHLIB_PREREQS = submake-libpq +subdir = contrib/zenith +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c new file mode 100644 index 00000000000..6ad1e65b04a --- /dev/null +++ b/contrib/zenith/inmem_smgr.c @@ -0,0 +1,298 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/zenith/inmem_smgr.c + * + * TODO cleanup obsolete copy-pasted comments + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "pagestore_client.h" +#include "utils/hsearch.h" +#include "access/xlog.h" + +typedef struct +{ + RelFileNode node; + ForkNumber forknum; + BlockNumber blkno; +} WrNodeKey; + +typedef struct +{ + WrNodeKey tag; + char data[BLCKSZ]; +} WrNode; + +HTAB *inmem_files; + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + HASHCTL hashCtl; + + hashCtl.keysize = sizeof(WrNodeKey); + hashCtl.entrysize = sizeof(WrNode); + + if (inmem_files) + hash_destroy(inmem_files); + + inmem_files = hash_create("wal-redo files map", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS); +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + WrNodeKey key; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = 0; + return hash_search(inmem_files, + &key, + HASH_FIND, + NULL) != NULL; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. 
+ * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forknum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blkno; + node = hash_search(inmem_files, + &key, + HASH_ENTER, + NULL); + memcpy(node->data, buffer, BLCKSZ); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blkno; + node = hash_search(inmem_files, + &key, + HASH_FIND, + NULL); + if (node != NULL) + memcpy(buffer, node->data, BLCKSZ); + else + memset(buffer, 0, BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blocknum; + node = hash_search(inmem_files, + &key, + HASH_ENTER, + NULL); + memcpy(node->data, buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. 
+ */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = 0; + + while (true) + { + node = hash_search(inmem_files, + &key, + HASH_FIND, + NULL); + if (node == NULL) + return key.blkno; + key.blkno += 1; + } +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + if (backend != InvalidBackendId && !InRecovery) + return smgr_standard(backend, rnode); + else + { + return &inmem_smgr; + } +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c new file mode 100644 index 00000000000..062f0cbf2e0 --- /dev/null +++ b/contrib/zenith/libpagestore.c @@ -0,0 +1,258 @@ +/*------------------------------------------------------------------------- + * + * libpqpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "replication/walproposer.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); + +#define PqPageStoreTrace DEBUG5 + +#define ZENITH_TAG "[ZENITH_SMGR] " +#define zenith_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(ZENITH_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn; + +static ZenithResponse * zenith_call(ZenithRequest request); +page_server_api api = { + .request = zenith_call +}; + +static void +zenith_connect() +{ + char *query; + int ret; + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("[ZENITH_SMGR] could not establish connection"), + errdetail_internal("%s", msg))); + } + + /* Ask the Page Server to connect to us, and stream WAL from us. */ + if (callmemaybe_connstring && callmemaybe_connstring[0]) + { + PGresult *res; + + query = psprintf("callmemaybe %s %s", zenith_timeline, callmemaybe_connstring); + res = PQexec(pageserver_conn, query); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + zenith_log(ERROR, + "[ZENITH_SMGR] callmemaybe command failed"); + } + PQclear(res); + } + + query = psprintf("pagestream %s", zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + zenith_log(ERROR, + "[ZENITH_SMGR] failed to start dispatcher_loop on pageserver"); + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + zenith_log(ERROR, "[ZENITH_SMGR] failed to get handshake from pageserver: %s", + PQerrorMessage(pageserver_conn)); + } + } + + zenith_log(LOG, "libpqpagestore: connected to '%s'", page_server_connstring); + + connected = true; +} + + +static ZenithResponse * +zenith_call(ZenithRequest request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithMessage *resp; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + zenith_connect(); + + req_buff = zm_pack((ZenithMessage *) & request); + + /* send request */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + zenith_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + { + char *msg = zm_to_string((ZenithMessage *) & request); + + zenith_log(PqPageStoreTrace, "Sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + zenith_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack(&resp_buff); + PQfreemem(resp_buff.data); + + Assert(messageTag(resp) == T_ZenithStatusResponse + || messageTag(resp) == T_ZenithNblocksResponse + || messageTag(resp) == T_ZenithReadResponse); + + { + char *msg = zm_to_string((ZenithMessage *) & request); + + zenith_log(PqPageStoreTrace, "Got response: %s", msg); + pfree(msg); + } + + + /* + * XXX: zm_to_string leak strings. 
Check with what memory contex all this + * methods are called. + */ + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_timeline(char **newval, void **extra, GucSource source) +{ + uint8 ztimelineid[16]; + + return **newval == '\0' || HexDecodeString(ztimelineid, *newval, 16); +} + +/* + * Module initialization function + */ +void +_PG_init(void) +{ + DefineCustomStringVariable("zenith.page_server_connstring", + "connection string to the page server", + NULL, + &page_server_connstring, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("zenith.callmemaybe_connstring", + "Connection string that Page Server or WAL safekeeper should use to connect to us", + NULL, + &callmemaybe_connstring, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("zenith.zenith_timeline", + "Zenith timelineid the server is running on", + NULL, + &zenith_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_timeline, NULL, NULL); + + DefineCustomBoolVariable("zenith.wal_redo", + "start in wal-redo mode", + NULL, + &wal_redo, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (page_server != NULL) + zenith_log(ERROR, "libpqpagestore already loaded"); + + zenith_log(PqPageStoreTrace, "libpqpagestore already loaded"); + page_server = &api; + + /* Is there more correct way to pass CustomGUC to postgres code? */ + zenith_timeline_walproposer = zenith_timeline; + + if (wal_redo) + { + zenith_log(PqPageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + zenith_log(PqPageStoreTrace, "set zenith_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + } +} diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h new file mode 100644 index 00000000000..400fb259a6b --- /dev/null +++ b/contrib/zenith/pagestore_client.h @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/zenith/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithReadRequest, + + /* pagestore -> pagestore_client */ + T_ZenithStatusResponse = 100, + T_ZenithNblocksResponse, + T_ZenithReadResponse, +} ZenithMessageTag; + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +extern char const *const ZenithMessageStr[]; + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} PageKey; + +typedef struct +{ + ZenithMessageTag tag; + uint64 system_id; + PageKey page_key; + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithMessageTag tag; + bool ok; + 
uint32 n_blocks; + char page[1]; +} ZenithResponse; + +StringInfoData zm_pack(ZenithMessage * msg); +ZenithMessage *zm_unpack(StringInfo s); +char *zm_to_string(ZenithMessage * msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest request); +} page_server_api; + +extern page_server_api * page_server; + +extern char *page_server_connstring; +extern char *callmemaybe_connstring; +extern char *zenith_timeline; +extern bool wal_redo; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +extern bool zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum); +extern void zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + +#endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c new file mode 100644 index 00000000000..3a91d80b926 --- /dev/null +++ b/contrib/zenith/pagestore_smgr.c @@ -0,0 +1,930 
@@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "storage/bufmgr.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "replication/walsender.h" +#include "catalog/pg_tablespace_d.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "storage/md.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +const int SmgrTrace = DEBUG5; + +bool loaded = false; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; +char *callmemaybe_connstring; +char *zenith_timeline; +bool wal_redo = false; + +char const *const ZenithMessageStr[] = +{ + "ZenithExistsRequest", + "ZenithNblocksRequest", + "ZenithReadRequest", + "ZenithStatusResponse", + "ZenithReadResponse", + "ZenithNblocksResponse", +}; + +StringInfoData +zm_pack(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = (ZenithRequest *) msg; + + pq_sendint32(&s, msg_req->page_key.rnode.spcNode); + pq_sendint32(&s, msg_req->page_key.rnode.dbNode); + pq_sendint32(&s, msg_req->page_key.rnode.relNode); + pq_sendbyte(&s, msg_req->page_key.forknum); + pq_sendint32(&s, msg_req->page_key.blkno); + pq_sendint64(&s, msg_req->lsn); + + break; + } + + /* pagestore -> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + pq_sendbyte(&s, msg_resp->ok); + pq_sendint32(&s, msg_resp->n_blocks); + break; + } + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + pq_sendbyte(&s, msg_resp->ok); + pq_sendint32(&s, msg_resp->n_blocks); + pq_sendbytes(&s, msg_resp->page, BLCKSZ); // XXX: should be varlena + break; + } + } + return s; +} + +ZenithMessage * +zm_unpack(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithMessage *msg = NULL; + + switch (tag) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = palloc0(sizeof(ZenithRequest)); + + msg_req->tag = tag; + msg_req->system_id = 42; + msg_req->page_key.rnode.spcNode = pq_getmsgint(s, 4); + msg_req->page_key.rnode.dbNode = pq_getmsgint(s, 4); + msg_req->page_key.rnode.relNode = pq_getmsgint(s, 4); + msg_req->page_key.forknum = pq_getmsgbyte(s); + msg_req->page_key.blkno = pq_getmsgint(s, 4); + msg_req->lsn = pq_getmsgint64(s); + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_req; 
+ break; + } + + /* pagestore -> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse)); + + msg_resp->tag = tag; + msg_resp->ok = pq_getmsgbyte(s); + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_resp; + break; + } + + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse) + BLCKSZ); + + msg_resp->tag = tag; + msg_resp->ok = pq_getmsgbyte(s); + msg_resp->n_blocks = pq_getmsgint(s, 4); + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_resp; + break; + } + } + + return msg; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + appendStringInfoString(&s, "{"); + appendStringInfo(&s, "\"type\": \"%s\"", ZenithMessageStr[msg->tag]); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = (ZenithRequest *) msg; + + appendStringInfo(&s, ", \"page_key\": \"%d.%d.%d.%d.%u\", \"lsn\": \"%X/%X\"}", + msg_req->page_key.rnode.spcNode, + msg_req->page_key.rnode.dbNode, + msg_req->page_key.rnode.relNode, + msg_req->page_key.forknum, + msg_req->page_key.blkno, + (uint32) (msg_req->lsn >> 32), (uint32) (msg_req->lsn)); + + break; + } + + /* pagestore -> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + + appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u}", + msg_resp->ok, + msg_resp->n_blocks + ); + + break; + } + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + + appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u, \"page\": \"XXX\"}", + msg_resp->ok, + msg_resp->n_blocks + ); + break; + } + } + return s.data; +} + + +static void +zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + /* + * If the page was not WAL-logged before eviction then we can lose its modification. + * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. + * + * See also comments to PD_WAL_LOGGED. + * + * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the whole index . + * That's duplicative with the WAL-logging that we do here. + * See log_newpage_range() calls. + * + * FIXME: Redoing this record will set the LSN on the page. That could + * mess up the LSN-NSN interlock in GiST index build. + */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. + * We should never miss clearing visibility map bits. + * + * TODO Is it too bad for performance? + * Hopefully we do not evict actively used vm too often. 
+ */ + XLogRecPtr recptr; + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + elog(SmgrTrace, "Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + else if (!(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED) + && !RecoveryInProgress()) + { + XLogRecPtr recptr; + /* + * We assume standard page layout here. + * + * But at smgr level we don't really know what kind of a page this is. + * We have filtered visibility map pages and fsm pages above. + * TODO Do we have any special page types? + */ + + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); + + /* If we wal-log hint bits, someone could concurrently update page + * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + * + * See comment to FlushBuffer(). + * The caller must hold a pin on the buffer and have share-locked the + * buffer contents. (Note: a share-lock does not prevent updates of + * hint bits in the buffer, so the page could change while the write + * is in progress, but we assume that that will not invalidate the data + * written.) + */ + Assert(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by log_newpage */ + + /* + * Need to flush it too, so that it gets sent to the Page Server before we + * might need to read it back. It should get flushed eventually anyway, at + * least if there is some other WAL activity, so this isn't strictly + * necessary for correctness. But if there is no other WAL activity, the + * page read might get stuck waiting for the record to be streamed out + * for an indefinite time. + * + * FIXME: Flushing the WAL is expensive. We should track the last "evicted" + * LSN instead, and update it here. Or just kick the bgwriter to do the + * flush, there is no need for us to block here waiting for it to finish. + */ + XLogFlush(recptr); + lsn = recptr; + elog(SmgrTrace, "Force wal logging of page %u of relation %u/%u/%u.%u, lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } else { + elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + SetLastWrittenPageLSN(lsn); +} + + + +/* + * zenith_init() -- Initialize private state + */ +void +zenith_init(void) +{ + /* noop */ +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +zenith_get_request_lsn(bool nonrel) +{ + XLogRecPtr lsn; + XLogRecPtr flushlsn; + + if (RecoveryInProgress()) + { + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + lsn = InvalidXLogRecPtr; + } + else if (am_walsender) + { + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + } + else if (nonrel) + { + lsn = GetFlushRecPtr(); + elog(DEBUG1, "zenith_get_request_lsn norel GetFlushRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else + { + lsn = GetLastWrittenPageLSN(); + flushlsn = GetFlushRecPtr(); + + /* + * Use the latest LSN that was evicted from the buffer cache. 
Any + * pages modified by later WAL records must still in the buffer cache, + * so our request cannot concern those. + */ + lsn = GetLastWrittenPageLSN(); + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + if (lsn == InvalidXLogRecPtr) + { + /* + * We haven't evicted anything yet since the server was + * started. Then just use the latest flushed LSN. That's always + * safe, using the latest evicted LSN is really just an + * optimization. + */ + lsn = flushlsn; + elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + + /* + * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, + * we shouldn't evict a page from the buffer cache before all its modifications have + * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. + */ + if (lsn > flushlsn) + { + elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + return lsn; +} + + +/* + * zenith_exists() -- Does the physical file exist? + */ +bool +zenith_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool ok; + ZenithResponse *resp; + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithExistsRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forkNum + }, + .lsn = zenith_get_request_lsn(false) + }); + ok = resp->ok; + pfree(resp); + return ok; +} + +/* + * zenith_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + +#ifdef DEBUG_COMPARE_LOCAL + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * zenith_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ +#ifdef DEBUG_COMPARE_LOCAL + mdunlink(rnode, forkNum, isRedo); +#endif +} + +/* + * zenith_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. 
+ */ +void +zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + zenith_wallog_page(reln, forkNum, blkno, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, blkno, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdextend(reln, forkNum, blkno, buffer, skipFsync); +#endif +} + +/* + * zenith_open() -- Initialize newly-opened relation. + */ +void +zenith_open(SMgrRelation reln) +{ + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] open noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdopen(reln); +#endif +} + +/* + * zenith_close() -- Close the specified relation, if it isn't closed already. + */ +void +zenith_close(SMgrRelation reln, ForkNumber forknum) +{ + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] close noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdclose(reln, forknum); +#endif +} + +/* + * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); + return true; +} + +/* + * zenith_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * zenith_read() -- Read the specified block from a relation. 
+ */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + ZenithResponse *resp; + XLogRecPtr request_lsn; + + request_lsn = zenith_get_request_lsn(false); + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithReadRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forkNum, + .blkno = blkno + }, + .lsn = request_lsn + }); + + if (!resp->ok) + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn))); + + memcpy(buffer, resp->page, BLCKSZ); + ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; /* Clear PD_WAL_LOGGED bit stored in WAL record */ + pfree(resp); + + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) { + if (!PageIsNew(pageserver_masked)) { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) { + elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + // assume heap + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + // assume btree + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); 
+ if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", (unsigned char)(page[i])); + } + + return result.data; +} +#endif + + +bool +zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum) +{ + bool ok; + ZenithResponse *resp; + + elog(SmgrTrace, "[ZENITH_SMGR] zenith_nonrel_page_exists relnode %u/%u/%u_%d blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno); + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithExistsRequest, + .page_key = { + .rnode = rnode, + .forknum = forknum, + .blkno = blkno + }, + .lsn = zenith_get_request_lsn(true) + }); + ok = resp->ok; + pfree(resp); + return ok; +} + +void +zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum) +{ + int bufsize = BLCKSZ; + ZenithResponse *resp; + XLogRecPtr lsn; + + //43 is magic for RELMAPPER_FILENAME in page cache + // relmapper files has non-standard size of 512bytes + if (forknum == 43) + bufsize = 512; + + lsn = zenith_get_request_lsn(true); + + elog(SmgrTrace, "[ZENITH_SMGR] read nonrel relnode %u/%u/%u_%d blkno %u lsn %X/%X", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno, + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithReadRequest, + .page_key = { + .rnode = rnode, + .forknum = forknum, + .blkno = blkno + }, + .lsn = lsn + }); + + if (!resp->ok) + elog(ERROR, "[ZENITH_SMGR] smgr page not found"); + + memcpy(buffer, resp->page, bufsize); + pfree(resp); +} + + +/* + * zenith_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + zenith_wallog_page(reln, forknum, blocknum, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif +} + +/* + * zenith_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + ZenithResponse *resp; + int n_blocks; + XLogRecPtr request_lsn; + + request_lsn = zenith_get_request_lsn(false); + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithNblocksRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forknum, + }, + .lsn = request_lsn + }); + n_blocks = resp->n_blocks; + + elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + n_blocks); + + pfree(resp); + return n_blocks; +} + +/* + * zenith_truncate() -- Truncate relation to specified number of blocks. + */ +void +zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + XLogRecPtr lsn; + + /* + * Truncating a relation drops all its buffers from the buffer cache without + * calling smgrwrite() on them. 
But we must account for that in our tracking + * of last-written-LSN all the same: any future smgrnblocks() request must + * return the new size after the truncation. We don't know what the LSN of + * the truncation record was, so be conservative and use the most recently + * inserted WAL record's LSN. + */ + lsn = GetXLogInsertRecPtr(); + + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + SetLastWrittenPageLSN(lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdtruncate(reln, forknum, nblocks); +#endif +} + +/* + * zenith_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. + */ +void +zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdimmedsync(reln, forknum); +#endif +} + +static const struct f_smgr zenith_smgr = +{ + .smgr_init = zenith_init, + .smgr_shutdown = NULL, + .smgr_open = zenith_open, + .smgr_close = zenith_close, + .smgr_create = zenith_create, + .smgr_exists = zenith_exists, + .smgr_unlink = zenith_unlink, + .smgr_extend = zenith_extend, + .smgr_prefetch = zenith_prefetch, + .smgr_read = zenith_read, + .smgr_write = zenith_write, + .smgr_writeback = zenith_writeback, + .smgr_nblocks = zenith_nblocks, + .smgr_truncate = zenith_truncate, + .smgr_immedsync = zenith_immedsync, +}; + + +const f_smgr * +smgr_zenith(BackendId backend, RelFileNode rnode) +{ + + /* Don't use page server for temp relations */ + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &zenith_smgr; +} + +void +smgr_init_zenith(void) +{ + zenith_init(); +} diff --git a/contrib/zenith/zenith.control b/contrib/zenith/zenith.control new file mode 100644 index 00000000000..9aa5e2f067a --- /dev/null +++ b/contrib/zenith/zenith.control @@ -0,0 +1,4 @@ +# zenith extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/zenith' From 871ee9820a79b401f5aa8717d15c6144dab90b16 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:13:11 +0300 Subject: [PATCH 003/214] [walredo] zenith_wal_redo.patch Add WAL redo helper for zenith - alternative postgres operation mode to replay wal by pageserver request. 
To start postgres in wal-redo mode, run postgres with --wal-redo option It requires zenith shared library and zenith.wal_redo Author: Heikki Linnakangas --- src/backend/access/transam/xlog.c | 14 +- src/backend/access/transam/xlogutils.c | 17 + src/backend/main/main.c | 4 + src/backend/tcop/Makefile | 2 + src/backend/tcop/zenith_wal_redo.c | 647 +++++++++++++++++++++++++ src/include/access/xlogutils.h | 2 + src/include/tcop/tcopprot.h | 4 + 7 files changed, 688 insertions(+), 2 deletions(-) create mode 100644 src/backend/tcop/zenith_wal_redo.c diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 60e3ae6e506..bbf9899fc6a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10577,10 +10577,20 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) { Buffer buffer; + XLogRedoAction result; - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + if (result == BLK_DONE && !IsUnderPostmaster) + { + /* + * In the special WAL process, blocks that are being ignored + * return BLK_DONE. Accept that. + */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index d17d660f460..baf4dbed4aa 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -31,6 +31,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -345,6 +347,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, elog(PANIC, "failed to locate backup block with ID %d", block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rnode, forknum, + blkno, mode, NULL); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. 
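The hunks above give the new hook a simple contract: when redo_read_buffer_filter is set and returns true for a block reference, replay of that block is skipped. In the RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK cases a zeroed, locked buffer is still handed back, since a caller that asked for a zeroed buffer expects to initialize it itself; in all other modes the caller gets InvalidBuffer together with BLK_DONE, which is why the xlog_redo FPI path above now tolerates BLK_DONE outside the postmaster and only releases valid buffers. The WAL redo helper added by this patch installs its filter around each rm_redo call; as a minimal sketch of that usage pattern (redo_block_filter and reader_state are from zenith_wal_redo.c further below):

	/* replay one record, materializing only the block being reconstructed */
	redo_read_buffer_filter = redo_block_filter;	/* true = skip this block */
	RmgrTable[record->xl_rmid].rm_redo(&reader_state);
	redo_read_buffer_filter = NULL;
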
diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 350ef5b7d95..aa74591e222 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -205,6 +205,10 @@ main(int argc, char *argv[]) PostgresMain(argc, argv, NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) + WalRedoMain(argc, argv, + NULL, /* no dbname */ + strdup(get_user_name_or_exit(progname))); /* does not return */ else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/tcop/Makefile b/src/backend/tcop/Makefile index f662a7dd1cf..84f027436a4 100644 --- a/src/backend/tcop/Makefile +++ b/src/backend/tcop/Makefile @@ -20,4 +20,6 @@ OBJS = \ pquery.o \ utility.o +OBJS += zenith_wal_redo.o + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c new file mode 100644 index 00000000000..4503648fc3e --- /dev/null +++ b/src/backend/tcop/zenith_wal_redo.c @@ -0,0 +1,647 @@ +/*------------------------------------------------------------------------- + * + * zenith_wal_redo.c + * Entry point for WAL redo helper + * + * + * This file contains an alternative main() function for the 'postgres' + * binary. In the special mode, we go into a special mode that's similar + * to the single user mode. We don't launch postmaster or any auxiliary + * processes. Instead, we wait for command from 'stdin', and respond to + * 'stdout'. + * + * There's a TAP test for this in contrib/zenith_store/t/002_wal_redo_helper.pl + * + * The protocol through stdin/stdout is loosely based on the libpq protocol. + * The process accepts messages through stdin, and each message has the format: + * + * char msgtype; + * int32 length; // length of message including 'length' but excluding + * // 'msgtype', in network byte order + * + * + * There are three message types: + * + * BeginRedoForBlock ('B'): Prepare for WAL replay for given block + * PushPage ('P'): Copy a page image (in the payload) to buffer cache + * ApplyRecord ('A'): Apply a WAL record (in the payload) + * GetPage ('G'): Return a page image from buffer cache. + * + * Currently, you only get a response to GetPage requests; the response is + * simply a 8k page, without any headers. Errors are logged to stderr. + * + * FIXME: + * - this currently requires a valid PGDATA, and creates a lock file there + * like a normal postmaster. There's no fundamental reason for that, though. + * - should have EndRedoForBlock, and flush page cache, to allow using this + * mechanism for more than one block without restarting the process. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/tcop/zenith_wal_redo.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_SELECT_H +#include +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include +#include +#endif + +#ifndef HAVE_GETRUSAGE +#include "rusagestub.h" +#endif + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "storage/ipc.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" + +static int ReadRedoCommand(StringInfo inBuf); +static void BeginRedoForBlock(StringInfo input_message); +static void PushPage(StringInfo input_message); +static void ApplyRecord(StringInfo input_message); +static bool redo_block_filter(XLogReaderState *record, uint8 block_id); +static void GetPage(StringInfo input_message); + +static BufferTag target_redo_tag; + +#define TRACE DEBUG5 + +/* ---------------------------------------------------------------- + * FIXME comment + * PostgresMain + * postgres main loop -- all backends, interactive or otherwise start here + * + * argc/argv are the command line arguments to be used. (When being forked + * by the postmaster, these are not the original argv array of the process.) + * dbname is the name of the database to connect to, or NULL if the database + * name should be extracted from the command line arguments or defaulted. + * username is the PostgreSQL user name to be used for the session. + * ---------------------------------------------------------------- + */ +void +WalRedoMain(int argc, char *argv[], + const char *dbname, + const char *username) +{ + int firstchar; + StringInfoData input_message; + + /* Initialize startup process environment if necessary. */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* + * Parse command-line options. + * TODO + */ + //process_postgres_switches(argc, argv, PGC_POSTMASTER, &dbname); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + proc_exit(1); + + /* + * Set up signal handlers. (InitPostmasterChild or InitStandaloneProcess + * has already set up BlockSig and made that the active signal mask.) + * + * Note that postmaster blocked all signals before forking child process, + * so there is no race condition whereby we might receive a signal before + * we have set up the handler. + * + * Also note: it's best not to use any signals that are SIG_IGNored in the + * postmaster. If such a signal arrives before we are able to change the + * handler to non-SIG_IGN, it'll get dropped. Instead, make a dummy + * handler in the postmaster to reserve the signal. (Of course, this isn't + * an issue for signals that are locally generated, such as SIGALRM and + * SIGPIPE.) 
+ */ +#if 0 + if (am_walsender) + WalSndSignals(); + else + { + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, StatementCancelHandler); /* cancel current query */ + pqsignal(SIGTERM, die); /* cancel current query and exit */ + + /* + * In a postmaster child backend, replace SignalHandlerForCrashExit + * with quickdie, so we can tell the client we're dying. + * + * In a standalone backend, SIGQUIT can be generated from the keyboard + * easily, while SIGTERM cannot, so we make both signals do die() + * rather than quickdie(). + */ + if (IsUnderPostmaster) + pqsignal(SIGQUIT, quickdie); /* hard crash time */ + else + pqsignal(SIGQUIT, die); /* cancel current query and exit */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + + /* + * Ignore failure to write to frontend. Note: if frontend closes + * connection, we will notice it and exit cleanly when control next + * returns to outer loop. This seems safer than forcing exit in the + * midst of output during who-knows-what operation... + */ + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGFPE, FloatExceptionHandler); + + /* + * Reset some signals that are accepted by postmaster but not by + * backend + */ + pqsignal(SIGCHLD, SIG_DFL); /* system() requires this on some + * platforms */ + } +#endif + + /* + * Validate we have been given a reasonable-looking DataDir and change into it. + */ + checkDataDir(); + ChangeToDataDir(); + + /* + * Create lockfile for data directory. + */ + CreateDataDirLockFile(false); + + /* read control file (error checking and contains config ) */ + LocalProcessControlFile(false); + + process_shared_preload_libraries(); + + /* Initialize MaxBackends (if under postmaster, was done already) */ + InitializeMaxBackends(); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do + * this before we can use LWLocks. + */ + InitAuxiliaryProcess(); + + SetProcessingMode(NormalProcessing); + + /* Redo routines won't work if we're not "in recovery" */ + InRecovery = true; + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* we need a ResourceOwner to hold buffer pins */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); + + /* Initialize resource managers */ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + + /* + * Main processing loop + */ + for (;;) + { + /* + * Release storage left over from prior query cycle, and create a new + * query input buffer in the cleared MessageContext. 
+ */ + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + initStringInfo(&input_message); + + set_ps_display("idle"); + + /* + * (3) read a command (loop blocks here) + */ + firstchar = ReadRedoCommand(&input_message); + + switch (firstchar) + { + case 'B': /* BeginRedoForBlock */ + BeginRedoForBlock(&input_message); + break; + + case 'P': /* PushPage */ + PushPage(&input_message); + break; + + case 'A': /* ApplyRecord */ + ApplyRecord(&input_message); + break; + + case 'G': /* GetPage */ + GetPage(&input_message); + break; + + /* + * EOF means we're done. Perform normal shutdown. + */ + case EOF: + + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid frontend message type %d", + firstchar))); + } + } /* end of input-reading loop */ +} + +/* + * Some debug function that may be handy for now. + */ +pg_attribute_unused() +static char * +pprint_buffer(char *data, int len) +{ + StringInfoData s; + initStringInfo(&s); + appendStringInfo(&s, "\n"); + for (int i = 0; i < len; i++) { + + appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); + if (i % 32 == 31) { + appendStringInfo(&s, "\n"); + } + } + appendStringInfo(&s, "\n"); + + return s.data; +} + +static char * +pprint_tag(BufferTag *tag) +{ + StringInfoData s; + + initStringInfo(&s); + + appendStringInfo(&s, "%u/%u/%u.%d blk %u", + tag->rnode.spcNode, + tag->rnode.dbNode, + tag->rnode.relNode, + tag->forkNum, + tag->blockNum + ); + + return s.data; +} +/* ---------------------------------------------------------------- + * routines to obtain user input + * ---------------------------------------------------------------- + */ + +/* + * Read next command from the client. + * + * the string entered by the user is placed in its parameter inBuf, + * and we act like a Q message was received. + * + * EOF is returned if end-of-file input is seen; time to shut down. + * ---------------- + */ + +/* + * Wait until there is data in stdin. Prints a log message every 10 s whil + * waiting. + */ +static void +wait_with_timeout(void) +{ + for (;;) + { + struct timeval timeout = {10, 0}; + fd_set fds; + int ret; + + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + + ret = select(1, &fds, NULL, NULL, &timeout); + if (ret != 0) + break; + elog(DEBUG1, "still alive"); + } +} + +static int +ReadRedoCommand(StringInfo inBuf) +{ + char c; + int qtype; + int32 len; + int nread; + + /* FIXME: Use unbuffered I/O here, because the WAL redo process was getting + * stuck with buffered I/O. I'm not sure why, or whether the bug was somewhere + * in here or in the calling page server side. + */ + wait_with_timeout(); + if (read(STDIN_FILENO, &c, 1) == 0) + return EOF; + qtype = c; + + /* + * Like in the FE/BE protocol, all messages have a length word next + * after the type code; we can read the message contents independently of + * the type. 
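+ *
+ * (The length is in network byte order and, as in FE/BE, counts itself but
+ * not the type byte; that is why four bytes are subtracted below before the
+ * payload is read.)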
+ */ + if (read(STDIN_FILENO, &len, 4) != 4) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not read message length"))); + } + + len = pg_ntoh32(len); + + if (len < 4) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + return EOF; + } + + len -= 4; /* discount length itself */ + + enlargeStringInfo(inBuf, len); + nread = 0; + while (nread < len) { + int n = read(STDIN_FILENO, inBuf->data + nread, len - nread); + if (n == -1) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("read error: %m"))); + if (n == 0) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + nread += n; + } + inBuf->len = len; + inBuf->data[len] = '\0'; + + return qtype; +} + + +/* + * Prepare for WAL replay on given block + */ +static void +BeginRedoForBlock(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + MemoryContext oldcxt; + SMgrRelation reln; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + + { + char* buf = pprint_tag(&target_redo_tag); + elog(TRACE, "BeginRedoForBlock %s", buf); + pfree(buf); + } + + MemoryContextSwitchTo(oldcxt); + + reln = smgropen(rnode, InvalidBackendId); + if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || + reln->smgr_cached_nblocks[forknum] < blknum + 1) + { + reln->smgr_cached_nblocks[forknum] = blknum + 1; + } +} + +/* + * Receive a page given by the client, and put it into buffer cache. + */ +static void +PushPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + const char *content; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + * 8k page content + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + content = pq_getmsgbytes(input_message, BLCKSZ); + + buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL); + page = BufferGetPage(buf); + memcpy(page, content, BLCKSZ); + MarkBufferDirty(buf); /* pro forma */ + UnlockReleaseBuffer(buf); +} + +/* + * Receive a WAL record, and apply it. + * + * All the pages should be loaded into the buffer cache by PushPage calls already. 
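+ *
+ * A typical exchange to reconstruct one page is therefore: a 'B' message
+ * naming the target block, a 'P' message pushing its base image (if there
+ * is one), one 'A' message per WAL record to apply, and finally a 'G'
+ * message to read the resulting page back.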
+ */ +static void +ApplyRecord(StringInfo input_message) +{ + /* recovery here */ + char *errormsg; + XLogRecPtr lsn; + XLogRecord *record; + int nleft; + XLogReaderState reader_state; + + /* + * message format: + * + * LSN (the *end* of the record) + * record + */ + lsn = pq_getmsgint64(input_message); + + /* note: the input must be aligned here */ + record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); + + nleft = input_message->len - input_message->cursor; + if (record->xl_tot_len != sizeof(XLogRecord) + nleft) + elog(ERROR, "mismatch between record (%d) and message size (%d)", + record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + + /* FIXME: use XLogReaderAllocate() */ + memset(&reader_state, 0, sizeof(XLogReaderState)); + reader_state.ReadRecPtr = 0; /* no 'prev' record */ + reader_state.EndRecPtr = lsn; /* this record */ + reader_state.decoded_record = record; + reader_state.errormsg_buf = palloc(1000 + 1); /* MAX_ERRORMSG_LEN */ + + if (!DecodeXLogRecord(&reader_state, record, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); + + /* Ignore any other blocks than the ones the caller is interested in */ + redo_read_buffer_filter = redo_block_filter; + + RmgrTable[record->xl_rmid].rm_redo(&reader_state); + + redo_read_buffer_filter = NULL; + + elog(TRACE, "applied WAL record with LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); +} + +static bool +redo_block_filter(XLogReaderState *record, uint8 block_id) +{ + BufferTag target_tag; + + if (!XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } + + /* + * If this block isn't one we are currently restoring, then return 'true' + * so that this gets ignored + */ + return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); +} + +/* + * Get a page image back from buffer cache. + * + * After applying some records. 
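+ *
+ * The response is the raw 8k page written to stdout with no framing; after
+ * sending it we release the buffer, drop the database's buffers and re-run
+ * smgrinit() so the next request starts from a clean slate.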
+ */ +static void +GetPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ + + buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL); + page = BufferGetPage(buf); + /* single thread, so don't bother locking the page */ + + /* Response: Page content */ + fwrite(page, 1, BLCKSZ, stdout); /* FIXME: check errors */ + fflush(stdout); + + ReleaseBuffer(buf); + DropDatabaseBuffers(rnode.dbNode); + smgrinit(); //reset inmem smgr state + + elog(TRACE, "Page sent back for block %u", blknum); +} diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 9ac602b674d..7cebdf3af6d 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -33,6 +33,8 @@ typedef enum * need to be replayed) */ } XLogRedoAction; +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 buffer_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 968345404e5..9da6e8768ab 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -88,4 +88,8 @@ extern bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source); extern const char *get_stats_option_name(const char *arg); +extern void WalRedoMain(int argc, char *argv[], + const char *dbname, + const char *username); + #endif /* TCOPPROT_H */ From 0b90dab95a49d40711880ce48436f8776e74c810 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:20:37 +0300 Subject: [PATCH 004/214] lastWrittenPageLSN.patch Save lastWrittenPageLSN in XLogCtlData to know what pages to request from remote pageserver. Authors: Konstantin Knizhnik Heikki Linnakangas --- src/backend/access/transam/xlog.c | 30 ++++++++++++++++++++++++++++++ src/backend/commands/dbcommands.c | 7 ++++--- src/include/access/xlog.h | 3 +++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bbf9899fc6a..1a13af5c2da 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -741,6 +741,7 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. 
*/ XLogRecPtr lastFpwDisableRecPtr; + XLogRecPtr lastWrittenPageLSN; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -8716,6 +8717,35 @@ GetInsertRecPtr(void) return recptr; } +/* + * GetLastWrittenPageLSN -- Returns maximal LSN of written page + */ +XLogRecPtr +GetLastWrittenPageLSN(void) +{ + XLogRecPtr lsn; + SpinLockAcquire(&XLogCtl->info_lck); + lsn = XLogCtl->lastWrittenPageLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return lsn; +} + +/* + * SetLastWrittenPageLSN -- Set maximal LSN of written page + */ +void +SetLastWrittenPageLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + if (lsn > XLogCtl->lastWrittenPageLSN) + XLogCtl->lastWrittenPageLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + + + + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 974c5777bc3..cef6d4aea62 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) /* Record the filesystem change in XLOG */ { xl_dbase_create_rec xlrec; - + XLogRecPtr lsn; xlrec.db_id = dboid; xlrec.tablespace_id = dsttablespace; xlrec.src_db_id = src_dboid; @@ -683,8 +683,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenPageLSN(lsn); } } table_endscan(scan); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index ee3e369b79f..8b8b14d2fd0 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -350,6 +350,9 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn); +extern XLogRecPtr GetLastWrittenPageLSN(void); + extern bool PromoteIsTriggered(void); extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); From 5d1d4aa4f346092e2e28da70b023cf00fd03ce8e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 21 May 2021 23:57:08 +0300 Subject: [PATCH 005/214] Fix GetPage requests right after replaying CREATE DATABASE In the test_createdb test, we created a new database, and created a new branch after that. 
I was seeing the test fail with: PANIC: could not open critical system index 2662 The WAL contained records like this: rmgr: XLOG len (rec/tot): 49/ 8241, tx: 0, lsn: 0/0163E8F0, prev 0/0163C8A0, desc: FPI , blkref #0: rel 1663/12985/1249 fork fsm blk 1 FPW rmgr: XLOG len (rec/tot): 49/ 8241, tx: 0, lsn: 0/01640940, prev 0/0163E8F0, desc: FPI , blkref #0: rel 1663/12985/1249 fork fsm blk 2 FPW rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/01642990, prev 0/01640940, desc: RUNNING_XACTS nextXid 541 latestCompletedXid 539 oldestRunningXid 540; 1 xacts: 540 rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/016429C8, prev 0/01642990, desc: CHECKPOINT_ONLINE redo 0/163C8A0; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 540; online rmgr: Database len (rec/tot): 42/ 42, tx: 540, lsn: 0/01642A40, prev 0/016429C8, desc: CREATE copy dir 1663/1 to 1663/16390 rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/01642A70, prev 0/01642A40, desc: RUNNING_XACTS nextXid 541 latestCompletedXid 539 oldestRunningXid 540; 1 xacts: 540 rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/01642AA8, prev 0/01642A70, desc: CHECKPOINT_ONLINE redo 0/1642A70; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 540; online rmgr: Transaction len (rec/tot): 66/ 66, tx: 540, lsn: 0/01642B20, prev 0/01642AA8, desc: COMMIT 2021-05-21 15:55:46.363728 EEST; inval msgs: catcache 21; sync rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/01642B68, prev 0/01642B20, desc: CHECKPOINT_SHUTDOWN redo 0/1642B68; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 0; shutdown The compute node had correctly replayed all the WAL up to the last record, and opened up. But when you tried to connect to the new database, the very first requests for the critical relations, like pg_class, were made with request LSN 0/01642990. That's the last record that's applicable to a particular block. Because the database CREATE record didn't bump up the "last written LSN", the getpage requests were made with too old LSN. I fixed this by adding a SetLastWrittenLSN() call to the redo of database CREATE record. It probably wouldn't hurt to also throw in a call at the end of WAL replay, but let's see if we bump into more cases like this first. This doesn't seem to be happening with page server as of 'main'; I was testing with a version where I had temporarily reverted all the recent changes to reconstruct control file, checkpoints, relmapper files etc. from the WAL records in the page server, so that the compute node was redoing all the WAL. I'm pretty sure we need this fix even with 'main', even though this test case wasn't failing there right now. --- src/backend/commands/dbcommands.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index cef6d4aea62..97a58988fb0 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -2356,6 +2356,16 @@ dbase_redo(XLogReaderState *record) * We don't need to copy subdirectories */ copydir(src_path, dst_path, false); + + /* + * Make sure any future requests to the page server see the new + * database. 
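+ * Otherwise the first reads of the freshly copied catalogs would be made
+ * with a last-written LSN from before the copy, and the page server would
+ * return pages that do not yet contain the new database.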
+ */ + { + XLogRecPtr lsn = record->EndRecPtr; + + SetLastWrittenPageLSN(lsn); + } } else if (info == XLOG_DBASE_DROP) { From 42b8f43e35b146ff33744d58d0d0da9caa28bd8c Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:26:04 +0300 Subject: [PATCH 006/214] handle_eviction_of_non_wal_logged_pages.patch Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) or delay wal-logging till the end of operation (i.e. index build). So if such page is evicted, we will lose the update. To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. If the page is evicted before it has been wal-logged, then zenith smgr creates FPI for it. Authors: Konstantin Knizhnik anastasia --- src/backend/access/common/bufmask.c | 2 ++ src/backend/access/gist/gistutil.c | 2 ++ src/backend/access/transam/xloginsert.c | 15 ++++++++++++++- src/backend/storage/buffer/bufmgr.c | 14 ++++++++++++++ src/backend/storage/page/bufpage.c | 2 +- src/include/storage/bufpage.h | 19 ++++++++++++++++++- 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index 409acecf42a..ce5d9c0341e 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -54,6 +54,8 @@ mask_page_hint_bits(Page page) PageClearFull(page); PageClearHasFreeLinePointers(page); + phdr->pd_flags &= ~PD_WAL_LOGGED; + /* * During replay, if the page LSN has advanced past our XLOG record's LSN, * we don't mark the page all-visible. See heap_xlog_visible() for diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 43ba03b6eb9..1a1bb4a53f6 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -866,6 +866,8 @@ gistNewBuffer(Relation r) if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); + ((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; + return buffer; } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 134c78f12b5..8d7895af6f3 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -239,6 +239,7 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; + ((PageHeader)regbuf->page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -294,6 +295,7 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; + ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -1185,7 +1187,18 @@ log_newpage_range(Relation rel, ForkNumber forkNum, MarkBufferDirty(bufpack[i]); } - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + /* + * Zenith forces WAL logging of evicted pages, + * so it can happen that in some cases when pages are first + * modified and then WAL logged (for example building GiST/GiN + * indexes) there are no more pages which need to be WAL logged at + * the end of build procedure. As far as XLogInsert throws error + * if not records were inserted, we need to reset the insert state. 
+ */ + if (nbufs > 0) + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + else + XLogResetInsertion(); for (i = 0; i < nbufs; i++) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index adb39dada4c..b9b94fa71eb 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1619,6 +1619,11 @@ MarkBufferDirty(Buffer buffer) if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } + /* + * Clear PD_WAL_LOGGED flag so that if dirty page is evicted from page pool + * before been WAL logged, FPI WAL record will be enforced. + */ + ((PageHeader)BufferGetPage(buffer))->pd_flags &= ~PD_WAL_LOGGED; } /* @@ -2008,6 +2013,15 @@ BufferSync(int flags) item->blockNum = bufHdr->tag.blockNum; } + /* Zenith XXX + * Consider marking this page as not WAL-logged, + * so that pagestore_smgr issued a log record before eviction + * and persisted hint changes. + * TODO: check performance impacts of this approach + * since extra wal-logging may worsen the performance. + */ + //((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; + UnlockBufHdr(bufHdr, buf_state); /* Check for barrier events in case NBuffers is large. */ diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 82ca91f5977..48dc7bde265 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,7 +427,7 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); - + ((PageHeader)oldPage)->pd_flags &= ~PD_WAL_LOGGED; pfree(tempPage); } diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index c86ccdaf608..6704f69f328 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -182,7 +182,24 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +/* Zenith XXX: + * Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) + * or delay wal-logging till the end of operation (i.e. index build). + * + * So if such page is evicted, we will lose the update. + * To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. + * If page is evicted before it has been wal-logged, then pagestore_smgr creates FPI for it. + * + * List of such operations: + * - GIN/GiST/SP-GiST index build + * - page and heaptuple hint bits + * - Clearing visibility map bits + * - FSM changes + * - ??? + */ +#define PD_WAL_LOGGED 0x0008 /* Page is wal-logged */ +#define PD_VALID_FLAG_BITS 0x000F /* OR of all valid pd_flags bits */ + /* * Page layout version number 0 is for pre-7.3 Postgres releases. 
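The hunks above only add the PD_WAL_LOGGED bookkeeping; the eviction-time behaviour described in the commit message (the zenith smgr creating an FPI for a page that was never WAL-logged) lives in the pagestore smgr, which is not part of this patch. The following is a hedged sketch only, with an assumed helper name, of what such a check could look like using log_newpage() and the SetLastWrittenPageLSN() call introduced earlier; it is not the actual pagestore_smgr code:

	/* assumes access/xloginsert.h, access/xlog.h, storage/bufpage.h, storage/smgr.h */
	static void
	wallog_page_if_needed(SMgrRelation reln, ForkNumber forknum,
						  BlockNumber blkno, char *buffer)
	{
		if ((((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED) == 0)
		{
			XLogRecPtr	lsn;

			/*
			 * The page was dirtied without being WAL-logged: emit a full-page
			 * image.  log_newpage() goes through XLogRegisterBlock(), which
			 * (as patched above) sets PD_WAL_LOGGED on the page again.
			 */
			lsn = log_newpage(&reln->smgr_rnode.node, forknum,
							  blkno, (Page) buffer, false);

			/* keep the invariant last-written LSN <= flush LSN */
			XLogFlush(lsn);
			SetLastWrittenPageLSN(lsn);
		}
	}

In this series the corresponding logic sits behind zenith_wallog_page(), which zenith_write() calls in the smgr patch above; that helper's body is not included in this excerpt, so the sketch is illustrative only.
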
From 8eba45835b40a736032ca48e75c5ae9ef4257eb0 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:33:46 +0300 Subject: [PATCH 007/214] [walproposer] wal_proposer.patch Add WalProposer background worker to broadcast WAL stream to Zenith WAL acceptors Author: Konstantin Knizhnik --- src/backend/postmaster/bgworker.c | 4 + src/backend/postmaster/postmaster.c | 6 + src/backend/replication/Makefile | 4 +- src/backend/replication/walproposer.c | 873 ++++++++++++++++++++ src/backend/replication/walproposer_utils.c | 237 ++++++ src/backend/replication/walsender.c | 103 ++- src/backend/utils/misc/guc.c | 25 + src/include/replication/walproposer.h | 174 ++++ 8 files changed, 1399 insertions(+), 27 deletions(-) create mode 100644 src/backend/replication/walproposer.c create mode 100644 src/backend/replication/walproposer_utils.c create mode 100644 src/include/replication/walproposer.h diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index c40410d73ea..2be49df0eb0 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,6 +22,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" +#include "replication/walproposer.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -128,6 +129,9 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "WalProposerMain", WalProposerMain } }; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 0a4533251ba..04694ae0583 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -117,6 +117,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -997,6 +998,11 @@ PostmasterMain(int argc, char *argv[]) */ ApplyLauncherRegister(); + /* + * Start WAL proposer bgworker is wal acceptors list is not empty + */ + WalProposerRegister(); + /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index a0381e52f31..23731a07576 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -24,7 +24,9 @@ OBJS = \ syncrep_gram.o \ walreceiver.o \ walreceiverfuncs.o \ - walsender.o + walsender.o \ + walproposer.o \ + walproposer_utils.o SUBDIRS = logical diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c new file mode 100644 index 00000000000..564defc024a --- /dev/null +++ b/src/backend/replication/walproposer.c @@ -0,0 +1,873 @@ +/*------------------------------------------------------------------------- + * + * walproposer.c + * + * Broadcast WAL stream to Zenith WAL acceptetors + */ +#include +#include +#include "replication/walproposer.h" +#include "storage/latch.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "replication/walreceiver.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +char* wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +bool am_wal_proposer; + +static int n_walkeepers = 0; +static int quorum = 0; +static WalKeeper walkeeper[MAX_WALKEEPERS]; 
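+
+/*
+ * Messages that have been queued for the walkeepers but not yet acknowledged
+ * by all of them.  HandleWalKeeperResponse() frees entries from the head of
+ * the queue once every walkeeper's bit is set in ackMask.
+ */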
+static WalMessage* msgQueueHead; +static WalMessage* msgQueueTail; +static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ +static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ +static ServerInfo serverInfo; +static WaitEventSet* waitEvents; +static WalKeeperResponse lastFeedback; +static XLogRecPtr restartLsn; /* Last position received by all walkeepers. */ +static RequestVote prop; /* Vote request for walkeeper */ +static int leader; /* Most advanced walkeeper */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; + +/* + * Combine hot standby feedbacks from all walkeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.hs.ts != 0) + { + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + { + hs->xmin = walkeeper[i].feedback.hs.xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + } + } +} + +static void +ResetWalProposerEventSet(void) +{ + if (waitEvents) + FreeWaitEventSet(waitEvents); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].sock != PGINVALID_SOCKET) + { + int events; + switch (walkeeper[i].state) + { + case SS_SEND_WAL: + events = WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE; + break; + case SS_CONNECTING: + events = WL_SOCKET_WRITEABLE; + break; + default: + events = WL_SOCKET_READABLE; + break; + } + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, events, walkeeper[i].sock, NULL, &walkeeper[i]); + } + } +} + +/* + * This function is called to establish new connection or to reestablish connection in case + * of connection failure. + * Close current connection if any and try to initiate new one + */ +static void +ResetConnection(int i) +{ + bool established; + + if (walkeeper[i].state != SS_OFFLINE) + { + elog(WARNING, "Connection with node %s:%s failed: %m", + walkeeper[i].host, walkeeper[i].port); + + /* Close old connection */ + closesocket(walkeeper[i].sock); + walkeeper[i].sock = PGINVALID_SOCKET; + walkeeper[i].state = SS_OFFLINE; + + /* Postgres wait event set API doesn't support deletion of events, so we have to reconstruct set */ + ResetWalProposerEventSet(); + } + + /* Try to establish new connection */ + walkeeper[i].sock = ConnectSocketAsync(walkeeper[i].host, walkeeper[i].port, &established); + if (walkeeper[i].sock != PGINVALID_SOCKET) + { + elog(LOG, "%s with node %s:%s", + established ? 
"Connected" : "Connecting", walkeeper[i].host, walkeeper[i].port); + + + if (established) + { + /* Start handshake: first of all send information about server */ + if (WriteSocket(walkeeper[i].sock, &serverInfo, sizeof serverInfo)) + { + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_READABLE, walkeeper[i].sock, NULL, &walkeeper[i]); + walkeeper[i].state = SS_HANDSHAKE; + walkeeper[i].asyncOffs = 0; + } + else + { + ResetConnection(i); + } + } + else + { + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, walkeeper[i].sock, NULL, &walkeeper[i]); + walkeeper[i].state = SS_CONNECTING; + } + } +} + + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_WALKEEPERS]; + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_walkeepers; i++) + { + responses[i] = walkeeper[i].feedback.epoch == prop.epoch + ? walkeeper[i].feedback.flushLsn : prop.VCL; + } + qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_walkeepers - quorum]; +} + +static void +HandleWalKeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastFeedback.flushLsn) + { + lastFeedback.flushLsn = minQuorumLsn; + ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + } + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + { + lastFeedback.hs = hsFeedback; + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + + /* Cleanup message queue */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) + { + WalMessage* msg = msgQueueHead; + msgQueueHead = msg->next; + if (restartLsn < msg->req.beginLsn) + restartLsn = msg->req.endLsn; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); + free(msg); + } + if (!msgQueueHead) /* queue is empty */ + msgQueueTail = NULL; +} + +char *zenith_timeline_walproposer = NULL; + +/* + * WAL proposer bgworeker entry point + */ +void +WalProposerMain(Datum main_arg) +{ + char* host; + char* sep; + char* port; + + /* Establish signal handlers. 
*/ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + + load_file("zenith", false); + + BackgroundWorkerUnblockSignals(); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_walkeepers+1 >= MAX_WALKEEPERS) + { + elog(FATAL, "Too many walkeepers"); + } + walkeeper[n_walkeepers].host = host; + walkeeper[n_walkeepers].port = port; + walkeeper[n_walkeepers].state = SS_OFFLINE; + walkeeper[n_walkeepers].sock = PGINVALID_SOCKET; + walkeeper[n_walkeepers].currMsg = NULL; + n_walkeepers += 1; + } + if (n_walkeepers < 1) + { + elog(FATAL, "WalKeepers addresses are not specified"); + } + quorum = n_walkeepers/2 + 1; + + GetXLogReplayRecPtr(&ThisTimeLineID); + + /* Fill information about server */ + serverInfo.timeline = ThisTimeLineID; + serverInfo.walEnd = GetFlushRecPtr(); + serverInfo.walSegSize = wal_segment_size; + serverInfo.pgVersion = PG_VERSION_NUM; + if (!zenith_timeline_walproposer) + elog(FATAL, "zenith.zenith_timeline is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + serverInfo.protocolVersion = SK_PROTOCOL_VERSION; + pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); + serverInfo.systemId = GetSystemIdentifier(); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + ResetWalProposerEventSet(); + + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(i); + } + + while (true) + WalProposerPoll(); +} + +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + /* + * Always start streaming at the beginning of a segment + */ + startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); + + cmd.slotname = NULL; + cmd.timeline = serverInfo.timeline; + cmd.startpoint = startpos; + StartReplication(&cmd); +} + +/* + * Send message to the particular node + */ +static void +SendMessageToNode(int i, WalMessage* msg) +{ + ssize_t rc; + + /* If there is no pending message then send new one */ + if (walkeeper[i].currMsg == NULL) + { + /* Skip already acknowledged messages */ + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; + + walkeeper[i].currMsg = msg; + } + else + msg = walkeeper[i].currMsg; + + if (msg != NULL) + { + msg->req.restartLsn = restartLsn; + msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + + elog(LOG, "sending message with len %ld VCL=%X/%X to %d", + msg->size - sizeof(WalKeeperRequest), + (uint32) (msg->req.commitLsn >> 32), (uint32) msg->req.commitLsn, i); + + rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); + if (rc < 0) + { + ResetConnection(i); + } + else if ((size_t)rc == msg->size) /* message was completely sent */ + { + walkeeper[i].asyncOffs = 0; + walkeeper[i].state = SS_RECV_FEEDBACK; + } + else + { + /* wait until socket is available for write */ + 
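+			/* Partial write: remember how much was sent in asyncOffs and let
+			 * the SS_SEND_WAL case in WalProposerPoll() finish the message
+			 * once the socket becomes writable again. */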
walkeeper[i].state = SS_SEND_WAL; + walkeeper[i].asyncOffs = rc; + ModifyWaitEvent(waitEvents, walkeeper[i].eventPos, WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE, NULL); + } + } +} + +/* + * Broadcast new message to all caught-up walkeepers + */ +static void +BroadcastMessage(WalMessage* msg) +{ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE && walkeeper[i].currMsg == NULL) + { + SendMessageToNode(i, msg); + } + } +} + +static WalMessage* +CreateMessage(XLogRecPtr startpos, char* data, int len) +{ + /* Create new message and append it to message queue */ + WalMessage* msg; + XLogRecPtr endpos; + len -= XLOG_HDR_SIZE; + endpos = startpos + len; + if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) + { + /* Message already queued */ + return NULL; + } + Assert(len >= 0); + msg = (WalMessage*)malloc(sizeof(WalMessage) + len); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(WalKeeperRequest) + len; + msg->next = NULL; + msg->ackMask = 0; + msg->req.beginLsn = startpos; + msg->req.endLsn = endpos; + msg->req.senderId = prop.nodeId; + memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); + + Assert(msg->req.endLsn >= lastSentLsn); + lastSentLsn = msg->req.endLsn; + return msg; +} + +void +WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) +{ + WalMessage* msg = CreateMessage(startpos, data, len); + if (msg != NULL) + BroadcastMessage(msg); +} + +/* + * Create WAL message with no data, just to let the walkeepers + * know that the VCL has advanced. + */ +static WalMessage* +CreateMessageVCLOnly(void) +{ + /* Create new message and append it to message queue */ + WalMessage* msg; + + if (lastSentLsn == 0) + { + /* FIXME: We haven't sent anything yet. Not sure what to do then.. 
*/ + return NULL; + } + + msg = (WalMessage*)malloc(sizeof(WalMessage)); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(WalKeeperRequest); + msg->next = NULL; + msg->ackMask = 0; + msg->req.beginLsn = lastSentLsn; + msg->req.endLsn = lastSentLsn; + msg->req.senderId = prop.nodeId; + /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + return msg; +} + + +/* + * Prepare vote request for election + */ +static void +StartElection(void) +{ + // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" + XLogRecPtr initWALPos = serverInfo.walSegSize; + prop.VCL = restartLsn = initWALPos; + prop.nodeId = serverInfo.nodeId; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_VOTING) + { + prop.nodeId.term = Max(walkeeper[i].info.server.nodeId.term, prop.nodeId.term); + restartLsn = Max(walkeeper[i].info.restartLsn, restartLsn); + if (walkeeper[i].info.epoch > prop.epoch + || (walkeeper[i].info.epoch == prop.epoch && walkeeper[i].info.flushLsn > prop.VCL)) + + { + prop.epoch = walkeeper[i].info.epoch; + prop.VCL = walkeeper[i].info.flushLsn; + leader = i; + } + } + } + /* Only walkeepers from most recent epoch can report it's FlushLsn to master */ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_VOTING) + { + if (walkeeper[i].info.epoch == prop.epoch) + { + walkeeper[i].feedback.flushLsn = walkeeper[i].info.flushLsn; + } + else + { + elog(WARNING, "WalKeeper %s:%s belongs to old epoch " INT64_FORMAT " while current epoch is " INT64_FORMAT, + walkeeper[i].host, + walkeeper[i].port, + walkeeper[i].info.epoch, + prop.epoch); + } + } + } + prop.nodeId.term += 1; + prop.epoch += 1; +} + + +static void +ReconnectWalKeepers(void) +{ + /* Initiate reconnect if timeout is expired */ + TimestampTz now = GetCurrentTimestamp(); + if (wal_acceptor_reconnect_timeout > 0 && now - last_reconnect_attempt > wal_acceptor_reconnect_timeout*1000) + { + last_reconnect_attempt = now; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_OFFLINE) + ResetConnection(i); + } + } +} + +/* + * Receive WAL from most advanced WAL keeper + */ +static bool +WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication", + walkeeper[leader].host, walkeeper[leader].port); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + walkeeper[leader].host, walkeeper[leader].port, + err))); + return false; + } + elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", + walkeeper[leader].host, walkeeper[leader].port, + (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, + timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) > 0) + { + Assert(buf[0] == 'w'); + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); + rec_start_lsn = 
pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void)CreateMessage(rec_start_lsn, buf, len); + if (rec_end_lsn >= endpos) + break; + } + walrcv_endstreaming(wrconn, &timeline); + walrcv_disconnect(wrconn); + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32)(startpos >> 32), (uint32)startpos))); + return false; + } + /* Setup restart point for all walkeepers */ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + { + for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= walkeeper[i].info.flushLsn) + { + msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ + } + else + { + SendMessageToNode(i, msg); + break; + } + } + } + } + return true; +} + +void +WalProposerPoll(void) +{ + while (true) + { + WaitEvent event; + int rc = WaitEventSetWait(waitEvents, -1, &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + WalKeeper* wk = (WalKeeper*)event.user_data; + int i = (int)(wk - walkeeper); + + /* If wait is terminated by error, postmaster die or latch event, then exit loop */ + if (rc <= 0 || (event.events & (WL_POSTMASTER_DEATH|WL_LATCH_SET)) != 0) + { + ResetLatch(MyLatch); + break; + } + + /* communication with walkeepers */ + if (event.events & WL_SOCKET_READABLE) + { + switch (wk->state) + { + case SS_HANDSHAKE: + /* Receive walkeeper node state */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info + wk->asyncOffs, + sizeof(wk->info) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->info)) + { + /* WalKeeper response completely received */ + + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + { + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + } + else + { + wk->state = SS_VOTING; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; + + /* Check if we have quorum */ + if (++n_connected >= quorum) + { + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) + { + if (walkeeper[j].state == SS_VOTING) + { + if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) + { + ResetConnection(j); + } + else + { + walkeeper[j].asyncOffs = 0; + walkeeper[j].state = SS_WAIT_VERDICT; + } + } + } + } + } + } + break; + + case SS_WAIT_VERDICT: + /* Receive walkeeper response for our candidate */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info.server.nodeId + wk->asyncOffs, + sizeof(wk->info.server.nodeId) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) + { + /* Response completely received */ + + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); + } + else + { + /* Handshake completed, do we have quorum? 
*/ + wk->state = SS_IDLE; + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn != prop.VCL) + { + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); + } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } + } + } + break; + + case SS_RECV_FEEDBACK: + /* Read walkeeper response with flushed WAL position */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->feedback + wk->asyncOffs, + sizeof(wk->feedback) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + { + WalMessage* next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + wk->state = SS_IDLE; + wk->asyncOffs = 0; + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. + */ + if (true) + { + XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + WalMessage *vclUpdateMsg; + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; + } + } + } + break; + + case SS_IDLE: + elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); + ResetConnection(i); + break; + + default: + elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + } + } + else if (event.events & WL_SOCKET_WRITEABLE) + { + switch (wk->state) + { + case SS_CONNECTING: + { + int optval = 0; + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); + if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + { + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, + strerror(optval)); + closesocket(wk->sock); + wk->sock = PGINVALID_SOCKET; + wk->state = SS_OFFLINE; + ResetWalProposerEventSet(); + } + else + { + uint32 len = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + /* + * Start handshake: send information about server. + * First of all send 0 as package size: it allows walkeeper to distinguish + * wal_proposer's connection from standard replication connection from pagers. 
+ */ + if (WriteSocket(wk->sock, &len, sizeof len) + && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + { + wk->state = SS_HANDSHAKE; + wk->asyncOffs = 0; + } + else + { + ResetConnection(i); + } + } + break; + } + + case SS_SEND_WAL: + rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == wk->currMsg->size) + { + /* WAL block completely sent */ + wk->state = SS_RECV_FEEDBACK; + wk->asyncOffs = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + } + break; + + default: + elog(FATAL, "Unexpected write state %d", wk->state); + } + } + ReconnectWalKeepers(); + } +} + + +/* + * WalProposerRegister + * Register a background worker proposing WAL to wal acceptors + */ +void +WalProposerRegister(void) +{ + BackgroundWorker bgw; + + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c new file mode 100644 index 00000000000..cea41ef01cc --- /dev/null +++ b/src/backend/replication/walproposer_utils.c @@ -0,0 +1,237 @@ +#include "replication/walproposer.h" +#include "common/logging.h" +#include "common/ip.h" +#include <netinet/tcp.h> +#include <unistd.h> + +int CompareNodeId(NodeId* id1, NodeId* id2) +{ + return + (id1->term < id2->term) + ? -1 + : (id1->term > id2->term) + ? 
1 + : memcmp(&id1->uuid, &id2->uuid, sizeof(pg_uuid_t)); +} + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +static bool +SetSocketOptions(pgsocket sock) +{ + int on = 1; + if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, sizeof(on)) < 0) + { + elog(WARNING, "setsockopt(TCP_NODELAY) failed: %m"); + closesocket(sock); + return false; + } + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *) &on, sizeof(on)) < 0) + { + elog(WARNING, "setsockopt(SO_REUSEADDR) failed: %m"); + closesocket(sock); + return false; + } + if (!pg_set_noblock(sock)) + { + elog(WARNING, "failed to switch socket to non-blocking mode: %m"); + closesocket(sock); + return false; + } + return true; +} + +pgsocket +ConnectSocketAsync(char const* host, char const* port, bool* established) +{ + struct addrinfo *addrs = NULL, + *addr, + hints; + int ret; + pgsocket sock = PGINVALID_SOCKET; + + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + hints.ai_next = NULL; + ret = pg_getaddrinfo_all(host, port, &hints, &addrs); + if (ret || !addrs) + { + elog(WARNING, "Could not resolve \"%s\": %s", + host, gai_strerror(ret)); + return -1; + } + for (addr = addrs; addr; addr = addr->ai_next) + { + sock = socket(addr->ai_family, SOCK_STREAM, 0); + if (sock == PGINVALID_SOCKET) + { + elog(WARNING, "could not create socket: %m"); + continue; + } + if (!SetSocketOptions(sock)) + continue; + + /* + * Initiate a non-blocking connect, retrying if interrupted by a signal; + * connect() may return EINPROGRESS, in which case the connection is + * completed asynchronously. 
+ */ + while ((ret = connect(sock, addr->ai_addr, addr->ai_addrlen)) < 0 && errno == EINTR); + if (ret < 0) + { + if (errno == EINPROGRESS) + { + *established = false; + break; + } + elog(WARNING, "Could not establish connection to %s:%s: %m", + host, port); + closesocket(sock); + } + else + { + *established = true; + break; + } + } + return sock; +} +ssize_t +ReadSocketAsync(pgsocket sock, void* buf, size_t size) +{ + size_t offs = 0; + + while (size != offs) + { + ssize_t rc = recv(sock, (char*)buf + offs, size - offs, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return offs; + elog(WARNING, "Socket read failed: %m"); + return -1; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return -1; + } + offs += rc; + } + return offs; +} + +ssize_t +WriteSocketAsync(pgsocket sock, void const* buf, size_t size) +{ + size_t offs = 0; + + while (size != offs) + { + ssize_t rc = send(sock, (char const*)buf + offs, size - offs, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return offs; + elog(WARNING, "Socket write failed: %m"); + return -1; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return -1; + } + offs += rc; + } + return offs; +} + +bool +WriteSocket(pgsocket sock, void const* buf, size_t size) +{ + char* src = (char*)buf; + + while (size != 0) + { + ssize_t rc = send(sock, src, size, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + elog(WARNING, "Socket write failed: %m"); + return false; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return false; + } + size -= rc; + src += rc; + } + return true; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. 
+ */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 28f0a294736..20a38385a43 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -73,6 +73,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" +#include "replication/walproposer.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -234,7 +235,7 @@ static XLogRecPtr GetStandbyFlushRecPtr(void); static void IdentifySystem(void); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); -static void StartReplication(StartReplicationCmd *cmd); +void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); @@ -568,7 +569,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. */ -static void +void StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; @@ -709,11 +710,14 @@ StartReplication(StartReplicationCmd *cmd) WalSndSetState(WALSNDSTATE_CATCHUP); /* Send a CopyBothResponse message, and start streaming */ - pq_beginmessage(&buf, 'W'); - pq_sendbyte(&buf, 0); - pq_sendint16(&buf, 0); - pq_endmessage(&buf); - pq_flush(); + if (!am_wal_proposer) + { + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); + } /* * Don't allow a request to stream from a future point in WAL that @@ -1335,7 +1339,7 @@ ProcessPendingWrites(void) } /* Try to flush pending output to the client */ - if (pq_flush_if_writable() != 0) + if (!am_wal_proposer && pq_flush_if_writable() != 0) WalSndShutdown(); } @@ -1744,6 +1748,9 @@ ProcessRepliesIfAny(void) int r; bool received = false; + if (am_wal_proposer) + return; + last_processing = GetCurrentTimestamp(); /* @@ -1919,21 +1926,34 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2116,7 +2136,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + 
ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2324,6 +2353,19 @@ WalSndLoop(WalSndSendDataCallback send_data) /* Check for input from the client */ ProcessRepliesIfAny(); + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + /* * If we have received CopyDone from the client, sent CopyDone * ourselves, and the output buffer is empty, it's time to exit @@ -2785,9 +2827,12 @@ XLogSendPhysical(void) /* * OK to read and send the slice. */ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ pq_sendint64(&output_message, 0); /* sendtime, filled in last */ @@ -2840,16 +2885,22 @@ XLogSendPhysical(void) output_message.len += nbytes; output_message.data[output_message.len] = '\0'; - /* - * Fill the send timestamp last, so that it is taken as late as possible. - */ - resetStringInfo(&tmpbuf); - pq_sendint64(&tmpbuf, GetCurrentTimestamp()); - memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); - - pq_putmessage_noblock('d', output_message.data, output_message.len); + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, output_message.data, output_message.len); + } + else + { + /* + * Fill the send timestamp last, so that it is taken as late as possible. 
+ */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + pq_putmessage_noblock('d', output_message.data, output_message.len); + } sentPtr = endptr; /* Update shared memory status */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 767ef918c76..3f663c2b65f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -80,6 +80,7 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" @@ -184,6 +185,7 @@ static int syslog_facility = 0; static void assign_syslog_facility(int newval, void *extra); static void assign_syslog_ident(const char *newval, void *extra); static void assign_session_replication_role(int newval, void *extra); + static bool check_temp_buffers(int *newval, void **extra, GucSource source); static bool check_bonjour(bool *newval, void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); @@ -2297,6 +2299,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_acceptor_reconnect", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout for reconnecting to offline wal acceptor."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), @@ -4601,6 +4614,17 @@ static struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, + { + {"wal_acceptors", PGC_POSTMASTER, UNGROUPED, + gettext_noop("List of Zenith WAL acceptors (host:port)"), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &wal_acceptors_list, + "", + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL @@ -11777,6 +11801,7 @@ assign_session_replication_role(int newval, void *extra) ResetPlanCache(); } + static bool check_temp_buffers(int *newval, void **extra, GucSource source) { diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h new file mode 100644 index 00000000000..e1845e3fb19 --- /dev/null +++ b/src/include/replication/walproposer.h @@ -0,0 +1,174 @@ +#ifndef __WALKEEPER_H__ +#define __WALKEEPER_H__ + +#include "postgres.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 1 + +#define MAX_WALKEEPERS 32 +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern bool am_wal_proposer; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; + +/* WAL safekeeper state */ +typedef enum +{ + SS_OFFLINE, + SS_CONNECTING, + SS_HANDSHAKE, + SS_VOTING, + SS_WAIT_VERDICT, + SS_IDLE, + SS_SEND_WAL, + SS_RECV_FEEDBACK +} WalKeeperState; + +/* + * Unique node identifier used by Paxos + */ +typedef struct NodeId +{ + uint64 term; + pg_uuid_t uuid; +} 
NodeId; + +/* + * Information about the Postgres server broadcast by the WAL proposer to walkeepers + */ +typedef struct ServerInfo +{ + uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 pgVersion; /* Postgres server version */ + NodeId nodeId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + XLogRecPtr walEnd; + TimeLineID timeline; + int walSegSize; +} ServerInfo; + +/* + * Vote request sent from proposer to walkeepers + */ +typedef struct RequestVote +{ + NodeId nodeId; + XLogRecPtr VCL; /* volume commit LSN */ + uint64 epoch; /* new epoch when walkeeper reaches VCL */ +} RequestVote; + +/* + * Information about a storage node + */ +typedef struct WalKeeperInfo +{ + uint32 magic; /* magic for verifying content of the control file */ + uint32 formatVersion; /* walkeeper format version */ + uint64 epoch; /* walkeeper's epoch */ + ServerInfo server; + XLogRecPtr commitLsn; /* part of WAL acknowledged by quorum */ + XLogRecPtr flushLsn; /* locally flushed part of WAL */ + XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper: min(commitLsn) for all walkeepers */ +} WalKeeperInfo; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + + +/* + * Request with WAL message sent from proposer to walkeeper. + */ +typedef struct WalKeeperRequest +{ + NodeId senderId; /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ + XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ +} WalKeeperRequest; + +/* + * All copy-data messages ('w') are linked into an L1 send list and asynchronously sent to receivers. + * When a message has been sent to all receivers, it is removed from the send list. 
+ */ +struct WalMessage +{ + WalMessage* next; /* L1 list of messages */ + uint32 size; /* message size */ + uint32 ackMask; /* mask of receivers that acknowledged receiving this message */ + WalKeeperRequest req; /* request to walkeeper (message header) */ +}; + +/* + * Report walkeeper state to proposer + */ +typedef struct WalKeeperResponse +{ + uint64 epoch; + XLogRecPtr flushLsn; + HotStandbyFeedback hs; +} WalKeeperResponse; + + +/* + * Descriptor of walkeeper + */ +typedef struct WalKeeper +{ + char const* host; + char const* port; + pgsocket sock; /* socket descriptor */ + WalMessage* currMsg; /* message being sent to the receiver */ + int asyncOffs;/* offset for asynchronous read/write operations */ + int eventPos; /* position in wait event set */ + WalKeeperState state;/* walkeeper state machine state */ + WalKeeperInfo info; /* walkeeper info */ + WalKeeperResponse feedback; /* feedback to master */ +} WalKeeper; + + +int CompareNodeId(NodeId* id1, NodeId* id2); +pgsocket ConnectSocketAsync(char const* host, char const* port, bool* established); +bool WriteSocket(pgsocket sock, void const* buf, size_t size); +ssize_t ReadSocketAsync(pgsocket sock, void* buf, size_t size); +ssize_t WriteSocketAsync(pgsocket sock, void const* buf, size_t size); +int CompareLsn(const void *a, const void *b); +void WalProposerMain(Datum main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested); +void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); +void StartReplication(StartReplicationCmd *cmd); + +#endif From 94b84ef103186d3a3130fef5cc225b40b2c7ea6f Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:41:11 +0300 Subject: [PATCH 008/214] persist_unlogged_tables.patch Ignore unlogged table qualifier. Add respective changes to regression test outputs. Author: Konstantin Knizhnik --- src/backend/commands/tablecmds.c | 6 + src/test/regress/expected/alter_table_1.out | 4487 ++++++++++++++++++ src/test/regress/expected/create_table_1.out | 1315 +++++ 3 files changed, 5808 insertions(+) create mode 100644 src/test/regress/expected/alter_table_1.out create mode 100644 src/test/regress/expected/create_table_1.out diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6158ddddbcd..bb0ea30dcd0 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -700,6 +700,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); + if (stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED) + { + /* Unlogged tables are not supported by Zenith */ + stmt->relation->relpersistence = RELPERSISTENCE_PERMANENT; + } + /* * Determine the lockmode to use when scanning parents. A self-exclusive * lock is needed here. 
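To illustrate the tablecmds.c hunk above: the UNLOGGED qualifier is silently downgraded to a permanent relation, which is what the regenerated expected outputs below reflect. A minimal sketch of the observable behaviour (not part of the patch; the table name unlogged_demo is hypothetical):

    CREATE UNLOGGED TABLE unlogged_demo (id int);   -- accepted, but created as a permanent table
    SELECT relpersistence FROM pg_class WHERE relname = 'unlogged_demo';   -- returns 'p' (permanent), not 'u'
    DROP TABLE unlogged_demo;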
diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 00000000000..0f116d8750d --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,4487 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r 
lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. 
+ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... 
RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values (1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" of relation "attmp6" is violated by some row +DELETE FROM attmp6 WHERE b > 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 20 +NOTICE: boo: 16 +-- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" of relation "parent_noinh_convalid" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; +DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, 
check (false) no inherit not valid); +-- not valid constraint added at creation time should automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits (nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_4 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= 
'12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed types +-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... 
+CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f | f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t 
| t + fkdd2 | "RI_FKey_check_ins" | 5 | t | t + fkdd2 | "RI_FKey_check_upd" | 17 | t | t + fkdi | "RI_FKey_check_ins" | 5 | t | f + fkdi | "RI_FKey_check_upd" | 17 | t | f + fkdi2 | "RI_FKey_check_ins" | 5 | t | f + fkdi2 | "RI_FKey_check_upd" | 17 | t | f + fknd | "RI_FKey_check_ins" | 5 | f | f + fknd | "RI_FKey_check_upd" | 17 | f | f + fknd2 | "RI_FKey_check_ins" | 5 | f | f + fknd2 | "RI_FKey_check_upd" | 17 | f | f +(12 rows) + +-- temp tables should go away by themselves, need not drop them. +-- test check constraint adding +create table atacc1 ( test int ); +-- add a check constraint +alter table atacc1 add constraint atacc_test1 check (test>3); +-- should fail +insert into atacc1 (test) values (2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (2). +-- should succeed +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails when added +create table atacc1 ( test int ); +-- insert a soon to be failing row +insert into atacc1 (test) values (2); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test>3); +ERROR: check constraint "atacc_test1" of relation "atacc1" is violated by some row +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails because the column doesn't exist +create table atacc1 ( test int ); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test1>3); +ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1.test". +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int, test3 int); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test+test23), test2 int); +alter table atacc1 add check (test2>test); +-- should fail for $2 +insert into atacc1 (test2, test) values (3, 4); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_check" +DETAIL: Failing row contains (4, 3). +drop table atacc1; +-- inheritance related tests +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc2 add constraint foo check (test2>0); +-- fail and then succeed on atacc2 +insert into atacc2 (test2) values (-3); +ERROR: new row for relation "atacc2" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc2 (test2) values (3); +-- fail and then succeed on atacc3 +insert into atacc3 (test2) values (-3); +ERROR: new row for relation "atacc3" violates check constraint "foo" +DETAIL: Failing row contains (null, -3, null). 
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". +-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" of relation "atacc2" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. +drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" of relation "atacc1" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" of relation "atacc1" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- additionally, we've seen issues with foreign key validation not being +-- properly delayed until after a table rewrite. Check that works ok. +create table atacc1 (a int primary key); +alter table atacc1 add constraint atacc1_fkey foreign key (a) references atacc1 (a) not valid; +alter table atacc1 validate constraint atacc1_fkey, alter a type bigint; +drop table atacc1; +-- we've also seen issues with check constraints being validated at the wrong +-- time when there's a pending table rewrite. +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,1); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +drop table atacc1; +-- same as above, but ensure the constraint violation is detected +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,2); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +ERROR: check constraint "atacc1_chk" of relation "atacc1" is violated by some row +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 3). 
+insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (3, null). +insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 1). +drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column "test" of relation "atacc1" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" 
contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" of relation "parent" contains null values +alter table child alter a set not null; +ERROR: column "a" of relation "child" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). 
+drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... 
+ ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... + ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... 
+ ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistent) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). 
+insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 0). +drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" 
of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". 
+drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. 
+alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". +alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +-- Test index handling in alter table column type (cf. bugs #15835, #15865) +create table anothertab(f1 int primary key, f2 int unique, + f3 int, f4 int, f5 int); +alter table anothertab + add exclude using btree (f3 with =); +alter table anothertab + add exclude using btree (f4 with =) where (f4 is not null); +alter table anothertab + add exclude using btree (f4 with =) where (f5 > 0); +alter table anothertab + add unique(f1,f4); +create index on anothertab(f2,f3); +create unique index on anothertab(f4); +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + f1 | integer | | not null | + f2 | integer | | | + f3 | integer | | | + f4 | integer | | | + f5 | integer | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +alter table anothertab alter column f1 type bigint; +alter table anothertab + alter column f2 type bigint, + alter column f3 type bigint, + alter column f4 type bigint; +alter table anothertab alter column f5 type bigint; +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | + f2 | bigint | | | + f3 | bigint | | | + f4 | bigint | | | + f5 | bigint | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +drop table anothertab; +-- test that USING expressions are parsed before column alter type / drop steps +create table another (f1 int, f2 text, f3 text); +insert into another values(1, 'one', 'uno'); +insert into another values(2, 'two', 'due'); +insert into another values(3, 'three', 'tre'); +select * from another; + f1 | f2 | f3 +----+-------+----- + 1 | one | uno + 2 | two | due + 3 | three | tre +(3 rows) + +alter table another + alter f1 type text using f2 || ' and ' || f3 || ' more', + alter f2 type bigint using f1 * 10, + drop column f3; +select * from another; + f1 | f2 +--------------------+---- + one and uno more | 10 + two and due more | 20 + three and tre more | 30 +(3 rows) + +drop table another; +-- Create an index that skips WAL, then 
perform a SET DATA TYPE that skips +-- rewriting the index. +begin; +create table skip_wal_skip_rewrite_index (c varchar(10) primary key); +alter table skip_wal_skip_rewrite_index alter c type varchar(20); +commit; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as 
oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 
as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + t +(1 row) + +-- test that SET STORAGE propagates to index correctly +create index test_storage_idx on test_storage (b, a); +alter table test_storage alter column a set storage external; +\d+ test_storage + Table "public.test_storage" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | external | | + b | integer | | | 0 | plain | | +Indexes: + "test_storage_idx" btree (b, a) + +\d+ test_storage_idx + Index "public.test_storage_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + a | text | yes | a | external | +btree, for table "public.test_storage" + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. 
+-- check for rollback of ANALYZE corrupting table property flags (bug #11638) +CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text); +CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text); +BEGIN; +ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey; +ANALYZE check_fk_presence_2; +ROLLBACK; +\d check_fk_presence_2 + Table "public.check_fk_presence_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + t | text | | | +Foreign-key constraints: + "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id) + +DROP TABLE check_fk_presence_1, check_fk_presence_2; +-- check column addition within a view (bug #14876) +create table at_base_table(id int, stuff text); +insert into at_base_table values (23, 'skidoo'); +create view at_view_1 as select * from at_base_table bt; +create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | +View definition: + SELECT bt.id, + bt.stuff + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo"} +(1 row) + +create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + more | integer | | | | plain | +View definition: + SELECT bt.id, + bt.stuff, + 2 + 2 AS more + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} +(1 row) + +drop view at_view_2; +drop view at_view_1; +drop table at_base_table; +-- check adding a column not iself requiring a rewrite, together with +-- a column requiring a default (bug #16038) +-- ensure that rewrites aren't 
silently optimized away, removing the +-- value of the test +CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) +RETURNS boolean +LANGUAGE plpgsql AS $$ +DECLARE + v_relfilenode oid; +BEGIN + v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; + + EXECUTE p_ddl; + + RETURN v_relfilenode <> (SELECT relfilenode FROM pg_class WHERE oid = p_tablename); +END; +$$; +CREATE TABLE rewrite_test(col text); +INSERT INTO rewrite_test VALUES ('something'); +INSERT INTO rewrite_test VALUES (NULL); +-- empty[12] don't need rewrite, but notempty[12]_rewrite will force one +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty1 text, + ADD COLUMN notempty1_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty2_rewrite serial, + ADD COLUMN empty2 text; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- also check that fast defaults cause no problem, first without rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty3 text, + ADD COLUMN notempty3_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty4_norewrite int default 42, + ADD COLUMN empty4 text; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +-- then with rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty5 text, + ADD COLUMN notempty5_norewrite int default 42, + ADD COLUMN notempty5_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty6_rewrite serial, + ADD COLUMN empty6 text, + ADD COLUMN notempty6_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- cleanup +DROP FUNCTION check_ddl_rewrite(regclass, text); +DROP TABLE rewrite_test; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' +,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | 
ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on alterlock + for each row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode 
+-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.latin1_to_utf8 for 'latin1' to 'utf8' from iso8859_1_to_utf8; +create text search parser alter1.prs(start = prsd_start, 
gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.latin1_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this should succeed because nothing is left in alter1 +drop schema alter1; +insert into alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.latin1_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + 
Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) 
+Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. +ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. 
+CREATE TYPE test_type3 AS (a int); +CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3; +ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int; +CREATE TYPE test_type_empty AS (); +DROP TYPE test_type_empty; +-- +-- typed tables: OF / NOT OF +-- +CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2)); +ALTER TYPE tt_t0 DROP ATTRIBUTE z; +CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK +CREATE TABLE tt1 (x int, y bigint); -- wrong base type +CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod +CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order +CREATE TABLE tt4 (x int); -- too few columns +CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too few columns +CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent +CREATE TABLE tt7 (x int, q text, y numeric(8,2)); +ALTER TABLE tt7 DROP q; -- OK +ALTER TABLE tt0 OF tt_t0; +ALTER TABLE tt1 OF tt_t0; +ERROR: table "tt1" has different type for column "y" +ALTER TABLE tt2 OF tt_t0; +ERROR: table "tt2" has different type for column "y" +ALTER TABLE tt3 OF tt_t0; +ERROR: table has column "y" where type requires "x" +ALTER TABLE tt4 OF tt_t0; +ERROR: table is missing column "y" +ALTER TABLE tt5 OF tt_t0; +ERROR: table has extra column "z" +ALTER TABLE tt6 OF tt_t0; +ERROR: typed tables cannot inherit +ALTER TABLE tt7 OF tt_t0; +CREATE TYPE tt_t1 AS (x int, y numeric(8,2)); +ALTER TABLE tt7 OF tt_t1; -- reassign an already-typed table +ALTER TABLE tt7 NOT OF; +\d tt7 + Table "public.tt7" + Column | Type | Collation | Nullable | Default +--------+--------------+-----------+----------+--------- + x | integer | | | + y | numeric(8,2) | | | + +-- make sure we can drop a constraint on the parent but it remains on the child +CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL)); +CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent); +ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check"; +-- should fail +INSERT INTO test_drop_constr_child (c) VALUES (NULL); +ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check" +DETAIL: Failing row contains (null). 
+DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test ( + id int, + positive_col int CHECK (positive_col > 0), + indexed_col int, + CONSTRAINT comment_test_pk PRIMARY KEY (id)); +CREATE INDEX comment_test_index ON comment_test(indexed_col); +COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test'; +COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test'; +COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col'; +COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test'; +COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test'; +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Change the datatype of all the columns. ALTER TABLE is optimized to not +-- rebuild an index if the new data type is binary compatible with the old +-- one. Check do a dummy ALTER TABLE that doesn't change the datatype +-- first, to test that no-op codepath, and another one that does. +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint; +-- Check that the comments are intact. +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Check compatibility for foreign keys and comments. This is done +-- separately as rebuilding the column type of the parent leads +-- to an error and would reduce the test scope. 
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
+-- instead create in public first, move to catalog +CREATE TABLE new_system_table(id serial primary key, othercol text); +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table SET SCHEMA public; +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +-- will be ignored -- already there: +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table RENAME TO old_system_table; +CREATE INDEX old_system_table__othercol ON old_system_table (othercol); +INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); +UPDATE old_system_table SET id = -id; +DELETE FROM old_system_table WHERE othercol = 'somedata'; +TRUNCATE old_system_table; +ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; +ALTER TABLE old_system_table DROP COLUMN othercol; +DROP TABLE old_system_table; +-- set logged +CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of an unlogged table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key +CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key +ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key +ALTER TABLE unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists +ALTER TABLE unlogged1 SET LOGGED; +-- check relpersistence of an unlogged table after changing to permanent +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing +DROP TABLE unlogged3; +DROP TABLE unlogged2; +DROP TABLE unlogged1; +-- set unlogged +CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of a permanent table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = 
i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p + toast index | i | p + toast table | t | p +(5 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u + toast index | i | u + toast table | t | u +(5 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer primary key; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | 
Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer primary key; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer primary key; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer REFERENCES test_add_column; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c4 integer REFERENCES test_add_column; +NOTICE: column "c4" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 8); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) 
+Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 10); +NOTICE: column "c5" of relation "test_add_column" already exists, skipping +\d test_add_column* + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + + Sequence "public.test_add_column_c5_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.test_add_column.c5 + + Index "public.test_add_column_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + c3 | integer | yes | c3 +primary key, btree, for table "public.test_add_column" + +DROP TABLE test_add_column; +\d test_add_column* +-- assorted cases with multiple ALTER TABLE steps +CREATE TABLE ataddindex(f1 INT); +INSERT INTO ataddindex VALUES (42), (43); +CREATE UNIQUE INDEX ataddindexi0 ON ataddindex(f1); +ALTER TABLE ataddindex + ADD PRIMARY KEY USING INDEX ataddindexi0, + ALTER f1 TYPE BIGINT; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | +Indexes: + "ataddindexi0" PRIMARY KEY, btree (f1) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(f1 VARCHAR(10)); +INSERT INTO ataddindex(f1) VALUES ('foo'), ('a'); +ALTER TABLE ataddindex + ALTER f1 SET DATA TYPE TEXT, + ADD EXCLUDE ((f1 LIKE 'a') WITH =); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + f1 | text | | | +Indexes: + "ataddindex_expr_excl" EXCLUDE USING btree ((f1 ~~ 'a'::text) WITH =) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD PRIMARY KEY (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | not null | + ref_id | integer | | | +Indexes: + "ataddindex_pkey" PRIMARY KEY, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD UNIQUE (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + ref_id | integer | | | +Indexes: + "ataddindex_id_key" UNIQUE CONSTRAINT, btree (id) +Foreign-key 
constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); +CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... 
+ ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); +ERROR: relation "nonexistent" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. 
+DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint "check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +LINE 1: ...LE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); + ^ +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +LINE 1: ...ER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; + ^ +-- check validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint of relation "part_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- 
check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint of relation "part1" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +LINE 1: ...LTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + ^ +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition "partr_def1" would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) 
PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a'); +INSERT INTO part_5_a (a, b) VALUES (6, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ERROR: partition constraint of relation "part_5_a" is violated by some row +-- delete the faulting row and also add a constraint to skip the scan +DELETE FROM part_5_a WHERE a NOT IN (3); +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ALTER TABLE list_parted2 DETACH PARTITION part_5; +ALTER TABLE part_5 DROP CONSTRAINT check_a; +-- scan should again be skipped, even though NOT NULL is now a column property +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +-- Check the case where attnos of the partitioning columns in the table being +-- attached differs from the parent. It should not affect the constraint- +-- checking logic that allows to skip the scan. +CREATE TABLE part_6 ( + c int, + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6) +); +ALTER TABLE part_6 DROP c; +ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6); +-- Similar to above, but the table being attached is a partitioned table +-- whose partition has still different attnos for the root partitioning +-- columns. +CREATE TABLE part_7 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +) PARTITION BY LIST (b); +CREATE TABLE part_7_a_null ( + c int, + d int, + e int, + LIKE list_parted2, -- 'a' will have attnum = 4 + CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'), + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +); +ALTER TABLE part_7_a_null DROP c, DROP d, DROP e; +ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +-- Same example, but check this time that the constraint correctly detects +-- violating rows +ALTER TABLE list_parted2 DETACH PARTITION part_7; +ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thusly, scan won't be skipped +INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); +SELECT tableoid::regclass, a, b FROM part_7 order by a; + tableoid | a | b +---------------+---+--- + part_7_a_null | 8 | + part_7_a_null | 9 | a +(2 rows) + +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +ERROR: partition constraint of relation "part_7_a_null" is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. 
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition "part5_def_p1" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql. +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... + ^ +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... 
+ ^ +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint of relation "hpart_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint of relation "hpart_5_a" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 3 is not a factor of 4, the modulus of existing partition "hpart_1". 
+DROP TABLE fail_part; +-- +-- DETACH PARTITION +-- +-- check that the table is partitioned at all +CREATE TABLE regular_table (a int); +ALTER TABLE regular_table DETACH PARTITION any_name; +ERROR: table "regular_table" is not partitioned +DROP TABLE regular_table; +-- check that the partition being detached exists at all +ALTER TABLE list_parted2 DETACH PARTITION part_4; +ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist +-- check that the partition being detached is actually a partition of the parent +CREATE TABLE not_a_part (a int); +ALTER TABLE list_parted2 DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "list_parted2" +ALTER TABLE list_parted2 DETACH PARTITION part_1; +ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; +-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and +-- attislocal/conislocal is set to true +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0; + attinhcount | attislocal +-------------+------------ + 0 | t + 0 | t +(2 rows) + +SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a'; + coninhcount | conislocal +-------------+------------ + 0 | t +(1 row) + +DROP TABLE part_3_4; +-- check that a detached partition is not dropped on dropping a partitioned table +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +ALTER TABLE range_parted2 DETACH PARTITION part_rp; +DROP TABLE range_parted2; +SELECT * from part_rp; + a +--- +(0 rows) + +DROP TABLE part_rp; +-- concurrent detach +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +BEGIN; +-- doesn't work in a partition block +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: ALTER TABLE ... 
DETACH CONCURRENTLY cannot run inside a transaction block +COMMIT; +CREATE TABLE part_rpd PARTITION OF range_parted2 DEFAULT; +-- doesn't work if there's a default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +-- doesn't work for the default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rpd CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +DROP TABLE part_rpd; +-- works fine +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +\d+ range_parted2 + Partitioned table "public.range_parted2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: RANGE (a) +Number of partitions: 0 + +-- constraint should be created +\d part_rp + Table "public.part_rp" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp_a_check" CHECK (a IS NOT NULL AND a >= 0 AND a < 100) + +CREATE TABLE part_rp100 PARTITION OF range_parted2 (CHECK (a>=123 AND a<133 AND a IS NOT NULL)) FOR VALUES FROM (100) to (200); +ALTER TABLE range_parted2 DETACH PARTITION part_rp100 CONCURRENTLY; +-- redundant constraint should not be created +\d part_rp100 + Table "public.part_rp100" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp100_a_check" CHECK (a >= 123 AND a < 133 AND a IS NOT NULL) + +DROP TABLE range_parted2; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint of relation "p11" is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; +DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (b int, a int); +alter table defpart_attach_test_d drop b; +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint of relation "defpart_attach_test_d" is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +-- check that attaching a partition correctly reports any rows in the default +-- partition that should not be there for the new partition to be attached +-- successfully +create table defpart_attach_test_2 (like defpart_attach_test_d); +alter table defpart_attach_test attach partition defpart_attach_test_2 for values in (2); +ERROR: updated partition constraint for default partition "defpart_attach_test_d" would be violated by some row +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; +/* Test case for bug #16242 */ +-- We create a parent and child where the child has missing +-- non-null attribute values, and arrange to pass them through +-- tuple conversion from the child to the parent tupdesc +create table bar1 (a integer, b integer not null default 1) + partition by range (a); +create table bar2 (a integer); +insert into bar2 values (1); +alter table bar2 add column b integer not null default 1; +-- (at this point bar2 contains tuple with natts=1) +alter table bar1 attach partition bar2 default; +-- this works: +select * from bar1; + a | b +---+--- + 1 | 1 +(1 row) + +-- this exercises tuple conversion: +create function xtrig() + returns trigger language plpgsql +as $$ + declare + r record; + begin + for r in select * from old loop + raise info 'a=%, b=%', r.a, r.b; + end loop; + return NULL; + end; +$$; +create trigger xtrig + after update on bar1 + referencing old table as old + for each statement execute procedure xtrig(); 
+update bar1 set a = a + 1; +INFO: a=1, b=1 +/* End test case for bug #16242 */ +-- Test that ALTER TABLE rewrite preserves a clustered index +-- for normal indexes and indexes on constraints. +create table alttype_cluster (a int); +alter table alttype_cluster add primary key (a); +create index alttype_cluster_ind on alttype_cluster (a); +alter table alttype_cluster cluster on alttype_cluster_ind; +-- Normal index remains clustered. +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +alter table alttype_cluster alter a type bigint; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +-- Constraint index remains clustered. +alter table alttype_cluster cluster on alttype_cluster_pkey; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +alter table alttype_cluster alter a type int; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +drop table alttype_cluster; diff --git a/src/test/regress/expected/create_table_1.out b/src/test/regress/expected/create_table_1.out new file mode 100644 index 00000000000..4ec5f297a34 --- /dev/null +++ b/src/test/regress/expected/create_table_1.out @@ -0,0 +1,1315 @@ +-- +-- CREATE_TABLE +-- +-- +-- CLASS DEFINITIONS +-- +CREATE TABLE hobbies_r ( + name text, + person text +); +CREATE TABLE equipment_r ( + name text, + hobby text +); +CREATE TABLE onek ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk2 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE person ( + name text, + age int4, + location point +); +CREATE TABLE emp ( + salary int4, + manager name +) INHERITS (person); +CREATE TABLE student ( + gpa float8 +) INHERITS (person); +CREATE TABLE stud_emp ( + percent int4 +) INHERITS (emp, student); +NOTICE: merging multiple inherited definitions of column "name" +NOTICE: merging multiple inherited definitions of column "age" +NOTICE: merging multiple inherited definitions of column "location" +CREATE TABLE city ( + name name, + location box, + budget city_budget 
+); +CREATE TABLE dept ( + dname name, + mgrname text +); +CREATE TABLE slow_emp4000 ( + home_base box +); +CREATE TABLE fast_emp4000 ( + home_base box +); +CREATE TABLE road ( + name text, + thepath path +); +CREATE TABLE ihighway () INHERITS (road); +CREATE TABLE shighway ( + surface text +) INHERITS (road); +CREATE TABLE real_city ( + pop int4, + cname text, + outline path +); +-- +-- test the "star" operators a bit more thoroughly -- this time, +-- throw in lots of NULL fields... +-- +-- a is the type root +-- b and c inherit from a (one-level single inheritance) +-- d inherits from b and c (two-level multiple inheritance) +-- e inherits from c (two-level single inheritance) +-- f inherits from e (three-level single inheritance) +-- +CREATE TABLE a_star ( + class char, + a int4 +); +CREATE TABLE b_star ( + b text +) INHERITS (a_star); +CREATE TABLE c_star ( + c name +) INHERITS (a_star); +CREATE TABLE d_star ( + d float8 +) INHERITS (b_star, c_star); +NOTICE: merging multiple inherited definitions of column "class" +NOTICE: merging multiple inherited definitions of column "a" +CREATE TABLE e_star ( + e int2 +) INHERITS (c_star); +CREATE TABLE f_star ( + f polygon +) INHERITS (e_star); +CREATE TABLE aggtest ( + a int2, + b float4 +); +CREATE TABLE hash_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE hash_name_heap ( + seqno int4, + random name +); +CREATE TABLE hash_txt_heap ( + seqno int4, + random text +); +CREATE TABLE hash_f8_heap ( + seqno int4, + random float8 +); +-- don't include the hash_ovfl_heap stuff in the distribution +-- the data set is too large for what it's worth +-- +-- CREATE TABLE hash_ovfl_heap ( +-- x int4, +-- y int4 +-- ); +CREATE TABLE bt_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE bt_name_heap ( + seqno name, + random int4 +); +CREATE TABLE bt_txt_heap ( + seqno text, + random int4 +); +CREATE TABLE bt_f8_heap ( + seqno float8, + random int4 +); +CREATE TABLE array_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE array_index_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE testjsonb ( + j jsonb +); +CREATE TABLE unknowntab ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TYPE unknown_comptype AS ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TABLE IF NOT EXISTS test_tsvector( + t text, + a tsvector +); +CREATE TABLE IF NOT EXISTS test_tsvector( + t text +); +NOTICE: relation "test_tsvector" already exists, skipping +-- invalid: non-lowercase quoted reloptions identifiers +CREATE TABLE tas_case WITH ("Fillfactor" = 10) AS SELECT 1 a; +ERROR: unrecognized parameter "Fillfactor" +CREATE UNLOGGED TABLE unlogged1 (a int primary key); -- OK +CREATE TEMPORARY TABLE unlogged2 (a int primary key); -- OK +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +REINDEX INDEX unlogged1_pkey; +REINDEX INDEX unlogged2_pkey; +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +DROP TABLE unlogged2; +INSERT INTO unlogged1 VALUES (42); +CREATE UNLOGGED TABLE public.unlogged2 (a int primary key); -- also OK 
+CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); -- not OK +ERROR: only temporary relations may be created in temporary schemas +LINE 1: CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); + ^ +CREATE TABLE pg_temp.implicitly_temp (a int primary key); -- OK +CREATE TEMP TABLE explicitly_temp (a int primary key); -- also OK +CREATE TEMP TABLE pg_temp.doubly_temp (a int primary key); -- also OK +CREATE TEMP TABLE public.temp_to_perm (a int primary key); -- not OK +ERROR: cannot create temporary relation in non-temporary schema +LINE 1: CREATE TEMP TABLE public.temp_to_perm (a int primary key); + ^ +DROP TABLE unlogged1, public.unlogged2; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +ERROR: relation "as_select1" already exists +CREATE TABLE IF NOT EXISTS as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +PREPARE select1 AS SELECT 1 as a; +CREATE TABLE as_select1 AS EXECUTE select1; +CREATE TABLE as_select1 AS EXECUTE select1; +ERROR: relation "as_select1" already exists +SELECT * FROM as_select1; + a +--- + 1 +(1 row) + +CREATE TABLE IF NOT EXISTS as_select1 AS EXECUTE select1; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +DEALLOCATE select1; +-- create an extra wide table to test for issues related to that +-- (temporarily hide query, to avoid the long CREATE TABLE stmt) +\set ECHO none +INSERT INTO extra_wide_table(firstc, lastc) VALUES('first col', 'last col'); +SELECT firstc, lastc FROM extra_wide_table; + firstc | lastc +-----------+---------- + first col | last col +(1 row) + +-- check that tables with oids cannot be created anymore +CREATE TABLE withoid() WITH OIDS; +ERROR: syntax error at or near "OIDS" +LINE 1: CREATE TABLE withoid() WITH OIDS; + ^ +CREATE TABLE withoid() WITH (oids); +ERROR: tables declared WITH OIDS are not supported +CREATE TABLE withoid() WITH (oids = true); +ERROR: tables declared WITH OIDS are not supported +-- but explicitly not adding oids is still supported +CREATE TEMP TABLE withoutoid() WITHOUT OIDS; DROP TABLE withoutoid; +CREATE TEMP TABLE withoutoid() WITH (oids = false); DROP TABLE withoutoid; +-- check restriction with default expressions +-- invalid use of column reference in default expressions +CREATE TABLE default_expr_column (id int DEFAULT (id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (id)); + ^ +CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); + ^ +CREATE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); + ^ +-- invalid column definition +CREATE TABLE default_expr_non_column (a int DEFAULT (avg(non_existent))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TABLE default_expr_non_column (a int DEFAULT (avg(non_existe... 
+ ^ +-- invalid use of aggregate +CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); +ERROR: aggregate functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); + ^ +-- invalid use of subquery +CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); +ERROR: cannot use subquery in DEFAULT expression +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); + ^ +-- invalid use of set-returning function +CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3))); +ERROR: set-returning functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (generate_serie... + ^ +-- Verify that subtransaction rollback restores rd_createSubid. +BEGIN; +CREATE TABLE remember_create_subid (c int); +SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_create_subid; +-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid. +CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; +-- +-- Partitioned tables +-- +-- cannot combine INHERITS and PARTITION BY (although grammar allows) +CREATE TABLE partitioned ( + a int +) INHERITS (some_table) PARTITION BY LIST (a); +ERROR: cannot create partitioned table as inheritance child +-- cannot use more than 1 column as partition key for list partitioned table +CREATE TABLE partitioned ( + a1 int, + a2 int +) PARTITION BY LIST (a1, a2); -- fail +ERROR: cannot use "list" partition strategy with more than one column +-- unsupported constraint type for partitioned tables +CREATE TABLE partitioned ( + a int, + EXCLUDE USING gist (a WITH &&) +) PARTITION BY RANGE (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 3: EXCLUDE USING gist (a WITH &&) + ^ +-- prevent using prohibited expressions in the key +CREATE FUNCTION retset (a int) RETURNS SETOF int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (retset(a)); +ERROR: set-returning functions are not allowed in partition key expressions +DROP FUNCTION retset(int); +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((avg(a))); +ERROR: aggregate functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE ((avg(a) OVER (PARTITION BY b))); +ERROR: window functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int +) PARTITION BY LIST ((a LIKE (SELECT 1))); +ERROR: cannot use subquery in partition key expression +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((42)); +ERROR: cannot use constant expression as partition key +CREATE FUNCTION const_func () RETURNS int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (const_func()); +ERROR: cannot use constant expression as partition key +DROP FUNCTION const_func(); +-- only accept valid partitioning strategy +CREATE TABLE partitioned ( + a int +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" +-- specified column must be present in the table +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (b); +ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ +-- cannot use system columns in partition key +CREATE TABLE partitioned ( + a int +) PARTITION BY 
RANGE (xmin); +ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ +-- cannot use pseudotypes +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (((a, b))); +ERROR: partition key column 1 has pseudo-type record +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, ('unknown')); +ERROR: partition key column 2 has pseudo-type unknown +-- functions in key must be immutable +CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (immut_func(a)); +ERROR: functions in partition key expression must be marked IMMUTABLE +DROP FUNCTION immut_func(int); +-- prevent using columns of unsupported types in key (type must have a btree operator class) +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +-- cannot add NO INHERIT constraints to partitioned tables +CREATE TABLE partitioned ( + a int, + CONSTRAINT check_a CHECK (a > 0) NO INHERIT +) PARTITION BY RANGE (a); +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +-- some checks after successful creation of a partitioned table +CREATE FUNCTION plusone(a int) RETURNS INT AS $$ SELECT a+1; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int, + b int, + c text, + d text +) PARTITION BY RANGE (a oid_ops, plusone(b), c collate "default", d collate "C"); +-- check relkind +SELECT relkind FROM pg_class WHERE relname = 'partitioned'; + relkind +--------- + p +(1 row) + +-- prevent a function referenced in partition key from being dropped +DROP FUNCTION plusone(int); +ERROR: cannot drop function plusone(integer) because other objects depend on it +DETAIL: table partitioned depends on function plusone(integer) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+-- partitioned table cannot participate in regular inheritance +CREATE TABLE partitioned2 ( + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); +CREATE TABLE fail () INHERITS (partitioned2); +ERROR: cannot inherit from partitioned table "partitioned2" +-- Partition key in describe output +\d partitioned + Partitioned table "public.partitioned" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + d | text | | | +Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 + +\d+ partitioned2 + Partitioned table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) +Number of partitions: 0 + +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). +CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + +DROP TABLE partitioned, partitioned2; +-- check reference to partitioned table's rowtype in partition descriptor +create table partitioned (a int, b int) + partition by list ((row(a, b)::partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'::partitioned); +create table partitioned2 + partition of partitioned for values in ('(2,4)'::partitioned); +explain (costs off) +select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: (ROW(a, b)::partitioned = '(1,2)'::partitioned) +(2 rows) + +drop table partitioned; +-- whole-row Var in partition key works too +create table partitioned (a int, b int) + partition by list ((partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'); +create table partitioned2 + partition of partitioned for values in ('(2,4)'); +explain (costs off) +select * from partitioned where partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: ((partitioned.*)::partitioned = '(1,2)'::partitioned) +(2 rows) + +\d+ partitioned1 + Table "public.partitioned1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | +Partition of: partitioned FOR VALUES IN ('(1,2)') +Partition 
constraint: (((partitioned1.*)::partitioned IS DISTINCT FROM NULL) AND ((partitioned1.*)::partitioned = '(1,2)'::partitioned)) + +drop table partitioned; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- +-- Partitions +-- +-- check partition bound syntax +CREATE TABLE list_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF list_parted FOR VALUES IN ('1'); +CREATE TABLE part_p2 PARTITION OF list_parted FOR VALUES IN (2); +CREATE TABLE part_p3 PARTITION OF list_parted FOR VALUES IN ((2+1)); +CREATE TABLE part_null PARTITION OF list_parted FOR VALUES IN (null); +\d+ list_parted + Partitioned table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Partitions: part_null FOR VALUES IN (NULL), + part_p1 FOR VALUES IN (1), + part_p2 FOR VALUES IN (2), + part_p3 FOR VALUES IN (3) + +-- forbidden expressions for partition bound with list partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename.somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename.s... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(somename)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._fail PARTITION OF list_parted FOR VALUES IN (sum(somename))... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); +ERROR: aggregate functions are not allowed in partition bound +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)); +ERROR: cannot use subquery in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (generate_series(4, 6)); +ERROR: set-returning functions are not allowed in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (generate_s... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "POSIX"); +ERROR: collations are not supported by type integer +LINE 1: ...ail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "P... + ^ +-- syntax does not allow empty list of values for list partitions +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); +ERROR: syntax error at or near ")" +LINE 1: ...E TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); + ^ +-- trying to specify range for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... + ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... 
+ ^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" +LINE 1: ...TE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + ^ +-- specified literal can't be cast to the partition column data type +CREATE TABLE bools ( + a bool +) PARTITION BY LIST (a); +CREATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); +ERROR: specified value cannot be cast to type boolean for column "a" +LINE 1: ...REATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); + ^ +DROP TABLE bools; +-- specified literal can be cast, and the cast might not be immutable +CREATE TABLE moneyp ( + a money +) PARTITION BY LIST (a); +CREATE TABLE moneyp_10 PARTITION OF moneyp FOR VALUES IN (10); +CREATE TABLE moneyp_11 PARTITION OF moneyp FOR VALUES IN ('11'); +CREATE TABLE moneyp_12 PARTITION OF moneyp FOR VALUES IN (to_char(12, '99')::int); +DROP TABLE moneyp; +-- cast is immutable +CREATE TABLE bigintp ( + a bigint +) PARTITION BY LIST (a); +CREATE TABLE bigintp_10 PARTITION OF bigintp FOR VALUES IN (10); +-- fails due to overlap: +CREATE TABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); +ERROR: partition "bigintp_10_2" would overlap partition "bigintp_10" +LINE 1: ...ABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); + ^ +DROP TABLE bigintp; +CREATE TABLE range_parted ( + a date +) PARTITION BY RANGE (a); +-- forbidden expressions for partition bounds with range partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename.somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename.somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (a) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (a) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(a)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(a)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(somename)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(somename)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'); +ERROR: aggregate functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM ((select 1)) TO ('2019-01-01'); +ERROR: cannot use subquery in partition bound +LINE 2: FOR VALUES FROM ((select 1)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); +ERROR: set-returning functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); + ^ +-- trying to specify list for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +ERROR: invalid bound specification for a range partition +LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); + ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- each of start and end bounds must have same number of values as the +-- length of the partition key +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: FROM must specify exactly one value per partitioning column +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', 1); +ERROR: TO must specify exactly one value per partitioning column +-- cannot specify null values in range bounds +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not a factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 25 is not divisible by 10, the modulus of existing partition "hpart_1". +-- previous modulus 50 is factor of 150 but this modulus is not a factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 150 is not a factor of 200, the modulus of existing partition "hpart_3". +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',... 
+ ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition +-- check if compatible with the specified parent +-- cannot create as partition of a non-partitioned table +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned +DROP TABLE unparted; +-- cannot create a permanent rel as partition of a temp rel +CREATE TEMP TABLE temp_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE fail_part PARTITION OF temp_parted FOR VALUES IN ('a'); +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted; +-- check for partition bound overlap and other invalid specifications +CREATE TABLE list_parted2 ( + a varchar +) PARTITION BY LIST (a); +CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); +CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); +ERROR: partition "fail_part" would overlap partition "part_null_z" +LINE 1: ...LE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); + ^ +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +ERROR: partition "fail_part" would overlap partition "part_ab" +LINE 1: ...ail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); + ^ +-- check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE (a); +-- trying to create range partition with empty range +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (0). +-- note that the range '[1, 1)' has no elements +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (1). +CREATE TABLE part0 PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (1); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (2); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) ... 
+ ^ +CREATE TABLE part1 PARTITION OF range_parted2 FOR VALUES FROM (1) TO (10); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1)... + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (maxvalue); +ERROR: partition "fail_part" would overlap partition "part1" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (max... + ^ +CREATE TABLE part2 PARTITION OF range_parted2 FOR VALUES FROM (20) TO (30); +CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); + ^ +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +LINE 1: ... TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + ^ +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); +-- now check for multi-column range partition key +CREATE TABLE range_parted3 ( + a int, + b int +) PARTITION BY RANGE (a, (b+1)); +CREATE TABLE part00 PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, 1); +ERROR: partition "fail_part" would overlap partition "part00" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalu... + ^ +CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, 1); +CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); +CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +ERROR: partition "fail_part" would overlap partition "part12" +LINE 1: ...rt PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1,... + ^ +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; +-- cannot create a partition that says column b is allowed to range +-- from -infinity to +infinity, while there exist partitions that have +-- more specific ranges +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +ERROR: partition "fail_part" would overlap partition "part10" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalu... 
+ ^ +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +LINE 1: ...LE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODU... + ^ +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +-- check schema propagation from parent +CREATE TABLE parted ( + a text, + b int NOT NULL DEFAULT 0, + CONSTRAINT check_a CHECK (length(a) > 0) +) PARTITION BY LIST (a); +CREATE TABLE part_a PARTITION OF parted FOR VALUES IN ('a'); +-- only inherited attributes (never local ones) +SELECT attname, attislocal, attinhcount FROM pg_attribute + WHERE attrelid = 'part_a'::regclass and attnum > 0 + ORDER BY attnum; + attname | attislocal | attinhcount +---------+------------+------------- + a | f | 1 + b | f | 1 +(2 rows) + +-- able to specify column default, column constraint, and table constraint +-- first check the "column specified more than once" error +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL, + b DEFAULT 1, + b CHECK (b >= 0), + CONSTRAINT check_a CHECK (length(a) > 0) +) FOR VALUES IN ('b'); +ERROR: column "b" specified more than once +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL DEFAULT 1, + CONSTRAINT check_a CHECK (length(a) > 0), + CONSTRAINT check_b CHECK (b >= 0) +) FOR VALUES IN ('b'); +NOTICE: merging constraint "check_a" with inherited definition +-- conislocal should be false for any merged constraints, true otherwise +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass ORDER BY conislocal, coninhcount; + conislocal | coninhcount +------------+------------- + f | 1 + t | 0 +(2 rows) + +-- Once check_b is added to the parent, it should be made non-local for part_b +ALTER TABLE parted ADD CONSTRAINT check_b CHECK (b >= 0); +NOTICE: merging constraint "check_b" with inherited definition +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +-- Neither check_a nor check_b are droppable from part_b +ALTER TABLE part_b DROP CONSTRAINT check_a; +ERROR: cannot drop inherited constraint "check_a" of relation "part_b" +ALTER TABLE part_b DROP CONSTRAINT check_b; +ERROR: cannot drop inherited constraint "check_b" of relation "part_b" +-- And dropping it from parted should leave no trace of them on part_b, unlike +-- traditional inheritance where they will be left behind, because they would +-- be local constraints. 
+ALTER TABLE parted DROP CONSTRAINT check_a, DROP CONSTRAINT check_b; +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- +(0 rows) + +-- specify PARTITION BY for a partition +CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); +ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ +CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); +-- create a level-2 partition +CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); +-- check that NOT NULL and default value are inherited correctly +create table parted_notnull_inh_test (a int default 1, b int not null default 0) partition by list (a); +create table parted_notnull_inh_test1 partition of parted_notnull_inh_test (a not null, b default 1) for values in (1); +insert into parted_notnull_inh_test (b) values (null); +ERROR: null value in column "b" of relation "parted_notnull_inh_test1" violates not-null constraint +DETAIL: Failing row contains (1, null). +-- note that while b's default is overriden, a's default is preserved +\d parted_notnull_inh_test1 + Table "public.parted_notnull_inh_test1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | 1 + b | integer | | not null | 1 +Partition of: parted_notnull_inh_test FOR VALUES IN (1) + +drop table parted_notnull_inh_test; +-- check that collations are assigned in partition bound expressions +create table parted_boolean_col (a bool, b text) partition by list(a); +create table parted_boolean_less partition of parted_boolean_col + for values in ('foo' < 'bar'); +create table parted_boolean_greater partition of parted_boolean_col + for values in ('foo' > 'bar'); +drop table parted_boolean_col; +-- check for a conflicting COLLATE clause +create table parted_collate_must_match (a text collate "C", b text collate "C") + partition by range (a); +-- on the partition key +create table parted_collate_must_match1 partition of parted_collate_must_match + (a collate "POSIX") for values from ('a') to ('m'); +-- on another column +create table parted_collate_must_match2 partition of parted_collate_must_match + (b collate "POSIX") for values from ('m') to ('z'); +drop table parted_collate_must_match; +-- check that non-matching collations for partition bound +-- expressions are coerced to the right collation +create table test_part_coll_posix (a text) partition by range (a collate "POSIX"); +-- ok, collation is implicitly coerced +create table test_part_coll partition of test_part_coll_posix for values from ('a' collate "C") to ('g'); +-- ok +create table test_part_coll2 partition of test_part_coll_posix for values from ('g') to ('m'); +-- ok, collation is implicitly coerced +create table test_part_coll_cast partition of test_part_coll_posix for values from (name 'm' collate "C") to ('s'); +-- ok; partition collation silently overrides the default collation of type 'name' +create table test_part_coll_cast2 partition of test_part_coll_posix for values from (name 's') to ('z'); +drop table test_part_coll_posix; +-- Partition bound in describe output +\d+ part_b + Table "public.part_b" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description 
+--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 1 | plain | | +Partition of: parted FOR VALUES IN ('b') +Partition constraint: ((a IS NOT NULL) AND (a = 'b'::text)) + +-- Both partition bound and partition key in describe output +\d+ part_c + Partitioned table "public.part_c" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: parted FOR VALUES IN ('c') +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text)) +Partition key: RANGE (b) +Partitions: part_c_1_10 FOR VALUES FROM (1) TO (10) + +-- a level-2 partition's constraint will include the parent's expressions +\d+ part_c_1_10 + Table "public.part_c_1_10" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: part_c FOR VALUES FROM (1) TO (10) +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text) AND (b IS NOT NULL) AND (b >= 1) AND (b < 10)) + +-- Show partition count in the parent's describe output +-- Tempted to include \d+ output listing partitions with bound info but +-- output could vary depending on the order in which partition oids are +-- returned. +\d parted + Partitioned table "public.parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | text | | | + b | integer | | not null | 0 +Partition key: LIST (a) +Number of partitions: 3 (Use \d+ to list them.) + +\d hash_parted + Partitioned table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) 
+ +-- check that we get the expected partition constraints +CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); +\d+ unbounded_range_part + Table "public.unbounded_range_part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) + +DROP TABLE unbounded_range_part; +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); +\d+ range_parted4_1 + Table "public.range_parted4_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) + +CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); +\d+ range_parted4_2 + Table "public.range_parted4_2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 3) OR ((abs(a) = 3) AND (abs(b) > 4)) OR ((abs(a) = 3) AND (abs(b) = 4) AND (c >= 5))) AND ((abs(a) < 6) OR ((abs(a) = 6) AND (abs(b) <= 7)))) + +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); +\d+ range_parted4_3 + Table "public.range_parted4_3" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) + +DROP TABLE range_parted4; +-- user-defined operator class in partition key +CREATE FUNCTION my_int4_sort(int4,int4) RETURNS int LANGUAGE sql + AS $$ SELECT CASE WHEN $1 = $2 THEN 0 WHEN $1 > $2 THEN 1 ELSE -1 END; $$; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING btree AS + OPERATOR 1 < (int4,int4), OPERATOR 2 <= (int4,int4), + OPERATOR 3 = (int4,int4), OPERATOR 4 >= (int4,int4), + OPERATOR 5 > (int4,int4), FUNCTION 1 my_int4_sort(int4,int4); +CREATE TABLE partkey_t (a int4) PARTITION BY RANGE (a test_int4_ops); +CREATE TABLE 
partkey_t_1 PARTITION OF partkey_t FOR VALUES FROM (0) TO (1000); +INSERT INTO partkey_t VALUES (100); +INSERT INTO partkey_t VALUES (200); +-- cleanup +DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE partkey_t, hash_parted, hash_parted2; +DROP OPERATOR CLASS test_int4_ops USING btree; +DROP FUNCTION my_int4_sort(int4,int4); +-- comments on partitioned tables columns +CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); +COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; +COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; +SELECT obj_description('parted_col_comment'::regclass); + obj_description +---------------------- + Am partitioned table +(1 row) + +\d+ parted_col_comment + Partitioned table "public.parted_col_comment" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+--------------- + a | integer | | | | plain | | Partition key + b | text | | | | extended | | +Partition key: LIST (a) +Number of partitions: 0 + +DROP TABLE parted_col_comment; +-- list partitioning on array type column +CREATE TABLE arrlp (a int[]) PARTITION BY LIST (a); +CREATE TABLE arrlp12 PARTITION OF arrlp FOR VALUES IN ('{1}', '{2}'); +\d+ arrlp12 + Table "public.arrlp12" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-----------+-----------+----------+---------+----------+--------------+------------- + a | integer[] | | | | extended | | +Partition of: arrlp FOR VALUES IN ('{1}', '{2}') +Partition constraint: ((a IS NOT NULL) AND ((a = '{1}'::integer[]) OR (a = '{2}'::integer[]))) + +DROP TABLE arrlp; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Partitioned table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; +-- partitions mixing temporary and permanent relations +create table perm_parted (a int) partition by list (a); +create temporary table temp_parted (a int) partition by list (a); +create table perm_part partition of temp_parted default; -- error +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +create temp table temp_part partition of perm_parted default; -- error +ERROR: cannot create a temporary relation as partition of permanent relation "perm_parted" +create temp table temp_part partition of temp_parted default; -- ok +drop table perm_parted cascade; +drop table temp_parted cascade; +-- check that adding partitions to a table while it is being used is prevented +create table tab_part_create (a int) partition by list (a); +create or replace function func_part_create() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_create_1 partition of tab_part_create for values in (1)'; + return null; + end $$; +create trigger trig_part_create before insert on tab_part_create + for each statement execute procedure func_part_create(); +insert into tab_part_create values (1); 
+ERROR: cannot CREATE TABLE .. PARTITION OF "tab_part_create" because it is being used by active queries in this session +CONTEXT: SQL statement "create table tab_part_create_1 partition of tab_part_create for values in (1)" +PL/pgSQL function func_part_create() line 3 at EXECUTE +drop table tab_part_create; +drop function func_part_create(); +-- test using a volatile expression as partition bound +create table volatile_partbound_test (partkey timestamp) partition by range (partkey); +create table volatile_partbound_test1 partition of volatile_partbound_test for values from (minvalue) to (current_timestamp); +create table volatile_partbound_test2 partition of volatile_partbound_test for values from (current_timestamp) to (maxvalue); +-- this should go into the partition volatile_partbound_test2 +insert into volatile_partbound_test values (current_timestamp); +select tableoid::regclass from volatile_partbound_test; + tableoid +-------------------------- + volatile_partbound_test2 +(1 row) + +drop table volatile_partbound_test; +-- test the case where a check constraint on default partition allows +-- to avoid scanning it when adding a new partition +create table defcheck (a int, b int) partition by list (b); +create table defcheck_def (a int, c int, b int); +alter table defcheck_def drop c; +alter table defcheck attach partition defcheck_def default; +alter table defcheck_def add check (b <= 0 and b is not null); +create table defcheck_1 partition of defcheck for values in (1, null); +-- test that complex default partition constraints are enforced correctly +insert into defcheck_def values (0, 0); +create table defcheck_0 partition of defcheck for values in (0); +ERROR: updated partition constraint for default partition "defcheck_def" would be violated by some row +drop table defcheck; +-- tests of column drop with partition tables and indexes using +-- predicates and expressions. +create table part_column_drop ( + useless_1 int, + id int, + useless_2 int, + d int, + b int, + useless_3 int +) partition by range (id); +alter table part_column_drop drop column useless_1; +alter table part_column_drop drop column useless_2; +alter table part_column_drop drop column useless_3; +create index part_column_drop_b_pred on part_column_drop(b) where b = 1; +create index part_column_drop_b_expr on part_column_drop((b = 1)); +create index part_column_drop_d_pred on part_column_drop(d) where d = 2; +create index part_column_drop_d_expr on part_column_drop((d = 2)); +create table part_column_drop_1_10 partition of + part_column_drop for values from (1) to (10); +\d part_column_drop + Partitioned table "public.part_column_drop" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition key: RANGE (id) +Indexes: + "part_column_drop_b_expr" btree ((b = 1)) + "part_column_drop_b_pred" btree (b) WHERE b = 1 + "part_column_drop_d_expr" btree ((d = 2)) + "part_column_drop_d_pred" btree (d) WHERE d = 2 +Number of partitions: 1 (Use \d+ to list them.) 
+ +\d part_column_drop_1_10 + Table "public.part_column_drop_1_10" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition of: part_column_drop FOR VALUES FROM (1) TO (10) +Indexes: + "part_column_drop_1_10_b_idx" btree (b) WHERE b = 1 + "part_column_drop_1_10_d_idx" btree (d) WHERE d = 2 + "part_column_drop_1_10_expr_idx" btree ((b = 1)) + "part_column_drop_1_10_expr_idx1" btree ((d = 2)) + +drop table part_column_drop; From f29ac2c909d21353aa404d0ee3ff3f32ad75b90f Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:42:23 +0300 Subject: [PATCH 009/214] fix_pg_table_size.patch Request relation size via smgr function, not just stat(filepath). --- src/backend/utils/adt/dbsize.c | 38 ++++++---------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 9de2ed09d99..ade36f28be5 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -23,6 +23,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -272,39 +273,12 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) static int64 calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; - - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) - { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); - - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + SMgrRelation srel = smgropen(*rfn, backend); + if (smgrexists(srel, forknum)) { + BlockNumber n = smgrnblocks(srel, forknum); + return (int64)n*BLCKSZ; } - - return totalsize; + return 0; } Datum From 0df368472c640c4379c54622bffd257d6a7dcfaf Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:42:48 +0300 Subject: [PATCH 010/214] [walredo] fix_gin_redo.patch Author: Konstantin Knizhnik --- src/backend/access/gin/ginxlog.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 09ce4d6a5ba..261ab868660 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action 
!= BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* From de384f2da4605dab37cf34f942053a9e3ae328e7 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:43:15 +0300 Subject: [PATCH 011/214] [walredo] fix_brin_redo.patch Author: Konstantin Knizhnik --- src/backend/access/brin/brin_xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 3519038b709..3e58a1aa79f 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + //ZENITH XXX Don't use BufferGetBlockNumber because wal-redo doesn't pin buffer. + XLogRecGetBlockTag(record, 0, NULL, NULL, ®pgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) From 860f839754f7dc433207f5e38876d1e50ce11f0d Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:43:49 +0300 Subject: [PATCH 012/214] speculative_records_workaround.patch --- src/backend/access/heap/heapam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e92e2570268..060e9cb741b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8978,7 +8978,7 @@ heap_xlog_insert(XLogReaderState *record) XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + ItemPointerSetOffsetNumber(&target_tid, (xlrec->flags & XLH_INSERT_IS_SPECULATIVE) ? 
SpecTokenOffsetNumber : xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is From 55dccb723885ec6ee1ca78efaa026ab4c82ba3c2 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:45:56 +0300 Subject: [PATCH 013/214] wallog_t_ctid.patch --- src/backend/access/heap/heapam.c | 24 ++++++++++++++++-------- src/include/access/heapam_xlog.h | 6 +++++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 060e9cb741b..116ed825c83 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2217,6 +2217,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2535,6 +2536,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -3052,7 +3054,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) @@ -3073,6 +3075,7 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); @@ -3800,6 +3803,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -4988,6 +4992,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); /* we don't decode row locks atm, so no need to log the origin */ @@ -6037,6 +6042,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); @@ -8224,7 +8230,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) bufflags |= REGBUF_WILL_INIT; @@ -8261,6 +8267,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8302,6 +8309,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); @@ -8935,7 +8943,7 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -9036,7 +9044,7 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -9179,7 +9187,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9319,7 +9327,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9452,7 +9460,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9593,7 +9601,7 @@ heap_xlog_lock(XLogReaderState *record) offnum); 
} HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 27db48184e6..e6d31be5222 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -108,6 +108,7 @@ typedef struct xl_heap_delete { TransactionId xmax; /* xmax of the deleted tuple */ OffsetNumber offnum; /* deleted tuple's offset */ + uint32 t_cid; uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -145,6 +146,7 @@ typedef struct xl_heap_header { uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; } xl_heap_header; @@ -186,6 +188,7 @@ typedef struct xl_multi_insert_tuple uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; @@ -215,9 +218,9 @@ typedef struct xl_heap_update OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + uint32 t_cid; TransactionId new_xmax; /* xmax of the new tuple */ OffsetNumber new_offnum; /* new tuple's offset */ - /* * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags * are set, xl_heap_header and tuple data for the old tuple follow. @@ -279,6 +282,7 @@ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ + uint32 t_cid; int8 infobits_set; /* infomask and infomask2 bits to set */ uint8 flags; /* XLH_LOCK_* flag bits */ } xl_heap_lock; From 94c7f115c7f2bf0d6a7c77d93ab18cf98206fb8e Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:47:00 +0300 Subject: [PATCH 014/214] vacuumlazy_debug_stub.patch --- src/backend/access/heap/vacuumlazy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8aab6e324e0..c684c4fbee3 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) else if (all_visible_according_to_vm && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + /* ZENITH-XXX: all visible hint is not wal-logged + * FIXME: Replay visibilitymap changes in pageserver + */ + elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", vacrel->relname, blkno); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); From a5db8dc2d0bfa39a9073f84dc4f4c598a6f039f1 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:48:33 +0300 Subject: [PATCH 015/214] [test] zenith_test_evict.patch --- src/backend/storage/buffer/bufmgr.c | 29 +++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 11 +++++++++++ src/include/storage/bufmgr.h | 2 ++ 3 files changed, 42 insertions(+) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index b9b94fa71eb..8b605774464 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -157,6 +157,9 @@ int checkpoint_flush_after = 0; int 
bgwriter_flush_after = 0; int backend_flush_after = 0; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + /* local state for StartBufferIO and related functions */ static BufferDesc *InProgressBuf = NULL; static bool IsForInput; @@ -1924,6 +1927,32 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf, true); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3f663c2b65f..6a52c2f56d8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -88,6 +88,7 @@ #include "storage/pg_shmem.h" #include "storage/predicate.h" #include "storage/proc.h" +#include "storage/smgr.h" #include "storage/standby.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" @@ -2125,6 +2126,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"zenith_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)"), + }, + &zenith_test_evict, + false, + NULL, NULL, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index aa64fb42ec4..6d140786c74 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -76,6 +76,8 @@ extern int checkpoint_flush_after; extern int backend_flush_after; extern int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; From d782c1b1d07be45d21473f2f46f84f6258aba006 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:53:07 +0300 Subject: [PATCH 016/214] fix_sequence_wallogging.patch --- src/backend/commands/sequence.c | 4 +- src/test/regress/expected/sequence_1.out | 824 +++++++++++++++++++++++ 2 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/sequence_1.out diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 98649986e15..109dbd6a944 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -53,7 +53,9 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 /* * The "special area" of a sequence's buffer page looks like this. 
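Setting SEQ_LOG_VALS to 0 disables the pre-logging optimization: nextval() can no longer hand out values that are only covered by a previously written WAL record, so every call emits its own XLOG_SEQ_LOG record and a WAL-driven page server always sees the latest value. The sketch below condenses the relevant decision in upstream nextval_internal() (src/backend/commands/sequence.c) to show why the constant has this effect; it is an illustration of the existing upstream logic, not code added by the patch.

	/* Condensed from nextval_internal(); illustration only, not part of the patch. */
	fetch = cache;                 /* values requested by this nextval() call */
	log = seq->log_cnt;            /* values still covered by an earlier WAL record */
	logit = false;

	if (log < fetch || !seq->is_called)
	{
		/* not enough pre-logged headroom: cover SEQ_LOG_VALS extra values */
		fetch = log = fetch + SEQ_LOG_VALS;   /* no extra headroom when the constant is 0 */
		logit = true;
	}

	/* ... values are fetched here, decrementing 'log' for each one consumed ... */

	if (logit && RelationNeedsWAL(seqrel))
	{
		/* XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG), covering everything up to the last logged value */
	}

	seq->log_cnt = log;            /* stays 0 with SEQ_LOG_VALS = 0, so the next call logs again */

The alternate expected file added below (sequence_1.out) reflects this behaviour: with no pre-logged values, log_cnt reads back as 0 rather than the usual 31 or 32, which is why the log_cnt_ok check in the renamed-sequence test now reports f.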
diff --git a/src/test/regress/expected/sequence_1.out b/src/test/regress/expected/sequence_1.out new file mode 100644 index 00000000000..462e3f3caa4 --- /dev/null +++ b/src/test/regress/expected/sequence_1.out @@ -0,0 +1,824 @@ +-- +-- CREATE SEQUENCE +-- +-- various error cases +CREATE UNLOGGED SEQUENCE sequence_testx; +ERROR: unlogged sequences are not supported +CREATE SEQUENCE sequence_testx INCREMENT BY 0; +ERROR: INCREMENT must not be zero +CREATE SEQUENCE sequence_testx INCREMENT BY -1 MINVALUE 20; +ERROR: MINVALUE (20) must be less than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 MAXVALUE -20; +ERROR: MINVALUE (1) must be less than MAXVALUE (-20) +CREATE SEQUENCE sequence_testx INCREMENT BY -1 START 10; +ERROR: START value (10) cannot be greater than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 START -10; +ERROR: START value (-10) cannot be less than MINVALUE (1) +CREATE SEQUENCE sequence_testx CACHE 0; +ERROR: CACHE (0) must be greater than zero +-- OWNED BY errors +CREATE SEQUENCE sequence_testx OWNED BY nobody; -- nonsense word +ERROR: invalid OWNED BY option +HINT: Specify OWNED BY table.column or OWNED BY NONE. +CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table +ERROR: referenced relation "pg_class_oid_index" is not a table or foreign table +CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema +ERROR: sequence must be in same schema as table it is linked to +CREATE TABLE sequence_test_table (a int); +CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column +ERROR: column "b" of relation "sequence_test_table" does not exist +DROP TABLE sequence_test_table; +-- sequence data types +CREATE SEQUENCE sequence_test5 AS integer; +CREATE SEQUENCE sequence_test6 AS smallint; +CREATE SEQUENCE sequence_test7 AS bigint; +CREATE SEQUENCE sequence_test8 AS integer MAXVALUE 100000; +CREATE SEQUENCE sequence_test9 AS integer INCREMENT BY -1; +CREATE SEQUENCE sequence_test10 AS integer MINVALUE -100000 START 1; +CREATE SEQUENCE sequence_test11 AS smallint; +CREATE SEQUENCE sequence_test12 AS smallint INCREMENT -1; +CREATE SEQUENCE sequence_test13 AS smallint MINVALUE -32768; +CREATE SEQUENCE sequence_test14 AS smallint MAXVALUE 32767 INCREMENT -1; +CREATE SEQUENCE sequence_testx AS text; +ERROR: sequence type must be smallint, integer, or bigint +CREATE SEQUENCE sequence_testx AS nosuchtype; +ERROR: type "nosuchtype" does not exist +LINE 1: CREATE SEQUENCE sequence_testx AS nosuchtype; + ^ +CREATE SEQUENCE sequence_testx AS smallint MAXVALUE 100000; +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +CREATE SEQUENCE sequence_testx AS smallint MINVALUE -100000; +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test5 AS smallint; -- success, max will be adjusted +ALTER SEQUENCE sequence_test8 AS smallint; -- fail, max has to be adjusted +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test8 AS smallint MAXVALUE 20000; -- ok now +ALTER SEQUENCE sequence_test9 AS smallint; -- success, min will be adjusted +ALTER SEQUENCE sequence_test10 AS smallint; -- fail, min has to be adjusted +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test10 AS smallint MINVALUE -20000; -- ok now +ALTER SEQUENCE sequence_test11 AS int; -- max will be adjusted +ALTER SEQUENCE sequence_test12 AS int; -- min will be adjusted +ALTER SEQUENCE 
sequence_test13 AS int; -- min and max will be adjusted +ALTER SEQUENCE sequence_test14 AS int; -- min and max will be adjusted +--- +--- test creation of SERIAL column +--- +CREATE TABLE serialTest1 (f1 text, f2 serial); +INSERT INTO serialTest1 VALUES ('foo'); +INSERT INTO serialTest1 VALUES ('bar'); +INSERT INTO serialTest1 VALUES ('force', 100); +INSERT INTO serialTest1 VALUES ('wrong', NULL); +ERROR: null value in column "f2" of relation "serialtest1" violates not-null constraint +DETAIL: Failing row contains (wrong, null). +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 +(3 rows) + +SELECT pg_get_serial_sequence('serialTest1', 'f2'); + pg_get_serial_sequence +--------------------------- + public.serialtest1_f2_seq +(1 row) + +-- test smallserial / bigserial +CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2, + f5 bigserial, f6 serial8); +INSERT INTO serialTest2 (f1) + VALUES ('test_defaults'); +INSERT INTO serialTest2 (f1, f2, f3, f4, f5, f6) + VALUES ('test_max_vals', 2147483647, 32767, 32767, 9223372036854775807, + 9223372036854775807), + ('test_min_vals', -2147483648, -32768, -32768, -9223372036854775808, + -9223372036854775808); +-- All these INSERTs should fail: +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +SELECT * FROM serialTest2 ORDER BY f2 ASC; + f1 | f2 | f3 | f4 | f5 | f6 +---------------+-------------+--------+--------+----------------------+---------------------- + test_min_vals | -2147483648 | -32768 | -32768 | -9223372036854775808 | -9223372036854775808 + test_defaults | 1 | 1 | 1 | 1 | 1 + test_max_vals | 2147483647 | 32767 | 32767 | 9223372036854775807 | 9223372036854775807 +(3 rows) + +SELECT nextval('serialTest2_f2_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f3_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f4_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f5_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f6_seq'); + nextval +--------- + 2 +(1 row) + +-- basic sequence operations using both text and oid references +CREATE SEQUENCE sequence_test; +CREATE SEQUENCE IF NOT EXISTS sequence_test; +NOTICE: relation "sequence_test" already exists, skipping +SELECT nextval('sequence_test'::text); + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::text); + currval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::regclass); + currval +--------- + 2 +(1 row) + +SELECT setval('sequence_test'::text, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::text, 99, false); + setval +-------- + 
99 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 99 +(1 row) + +SELECT setval('sequence_test'::regclass, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::regclass, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT currval('sequence_test'::regclass); +ERROR: currval of sequence "sequence_test" is not yet defined in this session +DROP SEQUENCE sequence_test; +-- renaming sequences +CREATE SEQUENCE foo_seq; +ALTER TABLE foo_seq RENAME TO foo_seq_new; +SELECT * FROM foo_seq_new; + last_value | log_cnt | is_called +------------+---------+----------- + 1 | 0 | f +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 2 +(1 row) + +-- log_cnt can be higher if there is a checkpoint just at the right +-- time, so just test for the expected range +SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; + last_value | log_cnt_ok | is_called +------------+------------+----------- + 2 | f | t +(1 row) + +DROP SEQUENCE foo_seq_new; +-- renaming serial sequences +ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo; +INSERT INTO serialTest1 VALUES ('more'); +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 + more | 3 +(4 rows) + +-- +-- Check dependencies of serial and ordinary sequences +-- +CREATE TEMP SEQUENCE myseq2; +CREATE TEMP SEQUENCE myseq3; +CREATE TEMP TABLE t1 ( + f1 serial, + f2 int DEFAULT nextval('myseq2'), + f3 int DEFAULT nextval('myseq3'::text) +); +-- Both drops should fail, but with different error messages: +DROP SEQUENCE t1_f1_seq; +ERROR: cannot drop sequence t1_f1_seq because other objects depend on it +DETAIL: default value for column f1 of table t1 depends on sequence t1_f1_seq +HINT: Use DROP ... CASCADE to drop the dependent objects too. +DROP SEQUENCE myseq2; +ERROR: cannot drop sequence myseq2 because other objects depend on it +DETAIL: default value for column f2 of table t1 depends on sequence myseq2 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+-- This however will work: +DROP SEQUENCE myseq3; +DROP TABLE t1; +-- Fails because no longer existent: +DROP SEQUENCE t1_f1_seq; +ERROR: sequence "t1_f1_seq" does not exist +-- Now OK: +DROP SEQUENCE myseq2; +-- +-- Alter sequence +-- +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +NOTICE: relation "sequence_test2" does not exist, skipping +ALTER SEQUENCE serialTest1 CYCLE; -- error, not a sequence +ERROR: "serialtest1" is not a sequence +CREATE SEQUENCE sequence_test2 START WITH 32; +CREATE SEQUENCE sequence_test4 INCREMENT BY -1; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test4'); + nextval +--------- + -1 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 0; -- error +ERROR: RESTART value (0) cannot be less than MINVALUE (1) +ALTER SEQUENCE sequence_test4 RESTART WITH 40; -- error +ERROR: RESTART value (40) cannot be greater than MAXVALUE (-1) +-- test CYCLE and NO CYCLE +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + 5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached maximum value of sequence "sequence_test2" (36) +ALTER SEQUENCE sequence_test2 RESTART WITH -24 START WITH -24 + INCREMENT BY -4 MINVALUE -36 MAXVALUE -5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + -5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH -24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached minimum value of sequence "sequence_test2" (-36) +-- reset +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 32 START WITH 32 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT setval('sequence_test2', -100); -- error +ERROR: setval: value -100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 100); -- error +ERROR: setval: value 100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 5); + setval +-------- + 5 +(1 row) + +CREATE SEQUENCE sequence_test3; -- not read from, to test is_called +-- Information schema 
+SELECT * FROM information_schema.sequences + WHERE sequence_name ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+--------------------+-----------+-------------------+-------------------------+---------------+-------------+----------------------+---------------------+-----------+-------------- + regression | public | sequence_test10 | smallint | 16 | 2 | 0 | 1 | -20000 | 32767 | 1 | NO + regression | public | sequence_test11 | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | sequence_test12 | integer | 32 | 2 | 0 | -1 | -2147483648 | -1 | -1 | NO + regression | public | sequence_test13 | integer | 32 | 2 | 0 | -32768 | -2147483648 | 2147483647 | 1 | NO + regression | public | sequence_test14 | integer | 32 | 2 | 0 | 32767 | -2147483648 | 2147483647 | -1 | NO + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES + regression | public | sequence_test3 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test4 | bigint | 64 | 2 | 0 | -1 | -9223372036854775808 | -1 | -1 | NO + regression | public | sequence_test5 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test6 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test7 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test8 | smallint | 16 | 2 | 0 | 1 | 1 | 20000 | 1 | NO + regression | public | sequence_test9 | smallint | 16 | 2 | 0 | -1 | -32768 | -1 | -1 | NO + regression | public | serialtest1_f2_foo | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f2_seq | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f3_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f4_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f5_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | serialtest2_f6_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO +(19 rows) + +SELECT schemaname, sequencename, start_value, min_value, max_value, increment_by, cycle, cache_size, last_value +FROM pg_sequences +WHERE sequencename ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequencename ASC; + schemaname | sequencename | start_value | min_value | max_value | increment_by | cycle | cache_size | last_value +------------+--------------------+-------------+----------------------+---------------------+--------------+-------+------------+------------ + public | sequence_test10 | 1 | -20000 | 32767 | 1 | f | 1 | + public | sequence_test11 | 1 | 1 | 2147483647 | 1 | f | 1 | + public | sequence_test12 | -1 | -2147483648 | -1 | -1 | f | 1 | + public | sequence_test13 | -32768 | -2147483648 | 2147483647 | 1 | f | 1 | + public | sequence_test14 | 32767 | -2147483648 | 2147483647 | -1 | f | 1 | + public | sequence_test2 | 32 | 5 | 36 | 4 | t | 1 | 5 + public | sequence_test3 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test4 | -1 | -9223372036854775808 | -1 | -1 | f | 1 | -1 + public | sequence_test5 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test6 | 1 | 1 | 32767 | 1 | f | 1 | + public | 
sequence_test7 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test8 | 1 | 1 | 20000 | 1 | f | 1 | + public | sequence_test9 | -1 | -32768 | -1 | -1 | f | 1 | + public | serialtest1_f2_foo | 1 | 1 | 2147483647 | 1 | f | 1 | 3 + public | serialtest2_f2_seq | 1 | 1 | 2147483647 | 1 | f | 1 | 2 + public | serialtest2_f3_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f4_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f5_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 + public | serialtest2_f6_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 +(19 rows) + +SELECT * FROM pg_sequence_parameters('sequence_test4'::regclass); + start_value | minimum_value | maximum_value | increment | cycle_option | cache_size | data_type +-------------+----------------------+---------------+-----------+--------------+------------+----------- + -1 | -9223372036854775808 | -1 | -1 | f | 1 | 20 +(1 row) + +\d sequence_test4 + Sequence "public.sequence_test4" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+----------------------+---------+-----------+---------+------- + bigint | -1 | -9223372036854775808 | -1 | -1 | no | 1 + +\d serialtest2_f2_seq + Sequence "public.serialtest2_f2_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.serialtest2.f2 + +-- Test comments +COMMENT ON SEQUENCE asdf IS 'won''t work'; +ERROR: relation "asdf" does not exist +COMMENT ON SEQUENCE sequence_test2 IS 'will work'; +COMMENT ON SEQUENCE sequence_test2 IS NULL; +-- Test lastval() +CREATE SEQUENCE seq; +SELECT nextval('seq'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +SELECT setval('seq', 99); + setval +-------- + 99 +(1 row) + +SELECT lastval(); + lastval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT lastval(); +ERROR: lastval is not yet defined in this session +CREATE SEQUENCE seq2; +SELECT nextval('seq2'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +DROP SEQUENCE seq2; +-- should fail +SELECT lastval(); +ERROR: lastval is not yet defined in this session +-- Test sequences in read-only transactions +CREATE TEMPORARY SEQUENCE sequence_test_temp1; +START TRANSACTION READ ONLY; +SELECT nextval('sequence_test_temp1'); -- ok + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: cannot execute nextval() in a read-only transaction +ROLLBACK; +START TRANSACTION READ ONLY; +SELECT setval('sequence_test_temp1', 1); -- ok + setval +-------- + 1 +(1 row) + +SELECT setval('sequence_test2', 1); -- error +ERROR: cannot execute setval() in a read-only transaction +ROLLBACK; +-- privileges tests +CREATE USER regress_seq_user; +-- nextval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT 
nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +-- currval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +-- lastval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT lastval(); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +-- setval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +SAVEPOINT save; +SELECT setval('seq3', 5); +ERROR: permission denied for sequence seq3 +ROLLBACK TO save; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT setval('seq3', 5); + setval +-------- + 5 +(1 row) + +SELECT nextval('seq3'); + nextval +--------- + 6 +(1 row) + +ROLLBACK; +-- ALTER SEQUENCE +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +ALTER SEQUENCE sequence_test2 START WITH 1; +ERROR: must be owner of sequence sequence_test2 +ROLLBACK; +-- Sequences should get wiped out as well: +DROP TABLE serialTest1, serialTest2; +-- Make sure sequences are gone: +SELECT * FROM information_schema.sequences WHERE sequence_name IN + ('sequence_test2', 'serialtest2_f2_seq', 'serialtest2_f3_seq', + 'serialtest2_f4_seq', 'serialtest2_f5_seq', 'serialtest2_f6_seq') + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+----------------+-----------+-------------------+-------------------------+---------------+-------------+---------------+---------------+-----------+-------------- + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES +(1 row) + +DROP USER regress_seq_user; +DROP SEQUENCE seq; +-- cache tests +CREATE SEQUENCE test_seq1 CACHE 10; +SELECT nextval('test_seq1'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- 
+ 2 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 3 +(1 row) + +DROP SEQUENCE test_seq1; From 5718c7cddcbdcce8d797710ca26f3f161375d73d Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 18 May 2021 12:24:06 +0300 Subject: [PATCH 017/214] Bring back change that got lost in refactoring. silence ReadBuffer_common error. TODO: add a comment, why this is fine for zenith. --- src/backend/storage/buffer/bufmgr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 8b605774464..1e73dbbcc58 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -929,11 +929,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (!PageIsNew((Page) bufBlock)) - ereport(ERROR, + { + // XXX-ZENITH + MemSet((char *) bufBlock, 0, BLCKSZ); + ereport(DEBUG1, (errmsg("unexpected data beyond EOF in block %u of relation %s", blockNum, relpath(smgr->smgr_rnode, forkNum)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); - + } /* * We *must* do smgrextend before succeeding, else the page will not * be reserved by the kernel, and the next P_NEW call will decide to From 2fd3875ecb40a61a6021c4c1b8be4d031984c81d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 7 Jun 2021 16:00:42 +0300 Subject: [PATCH 018/214] [contrib/zenith] [refer #225] if insert WAL position points at the end of WAL page header, then return it back to the page origin --- contrib/zenith/pagestore_smgr.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 3a91d80b926..19e39ffeb74 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -16,6 +16,7 @@ #include "access/xlog.h" #include "access/xloginsert.h" +#include "access/xlog_internal.h" #include "pagestore_client.h" #include "storage/relfilenode.h" #include "storage/smgr.h" @@ -358,6 +359,29 @@ zenith_init(void) #endif } +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. + */ +static XLogRecPtr +zm_adjust_lsn(XLogRecPtr lsn) +{ + /* If lsn points to the beging of first record on page or segment, + * then "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ-1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size-1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} /* * Return LSN for requesting pages and number of blocks from page server @@ -388,7 +412,6 @@ zenith_get_request_lsn(bool nonrel) } else { - lsn = GetLastWrittenPageLSN(); flushlsn = GetFlushRecPtr(); /* @@ -412,6 +435,8 @@ zenith_get_request_lsn(bool nonrel) elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); } + else + lsn = zm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush LSN? 
Probably not, @@ -858,6 +883,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); + lsn = zm_adjust_lsn(lsn); + /* * Flush it, too. We don't actually care about it here, but let's uphold * the invariant that last-written LSN <= flush LSN. From c58bf01ba79cc5bcef13f6c13ea377822e67c963 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 8 Jun 2021 18:44:41 +0300 Subject: [PATCH 019/214] [walproposer] Create replication slot for walproposer to avoid loose of WAL at compute node + Check for presence of replication slot --- src/backend/replication/walproposer.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 564defc024a..d71061765b8 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -11,6 +11,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "access/xlog.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" @@ -24,6 +25,8 @@ char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + static int n_walkeepers = 0; static int quorum = 0; static WalKeeper walkeeper[MAX_WALKEEPERS]; @@ -293,6 +296,13 @@ WalProposerMain(Datum main_arg) InitWalSender(); ResetWalProposerEventSet(); + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotRelease(); + } + /* Initiate connections to all walkeeper nodes */ for (int i = 0; i < n_walkeepers; i++) { @@ -312,7 +322,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - cmd.slotname = NULL; + cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; StartReplication(&cmd); @@ -535,8 +545,8 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication", - walkeeper[leader].host, walkeeper[leader].port); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", + walkeeper[leader].host, walkeeper[leader].port, zenith_timeline_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { From e7ef05c5beeb61f03634b28eb0eae3bd69bc59e3 Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 17 Aug 2021 20:12:31 +0300 Subject: [PATCH 020/214] [walproposer] Skip absent WAL segment removed by pg_resetwal --- src/backend/replication/walproposer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d71061765b8..857cef5deb7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -313,6 +313,16 @@ WalProposerMain(Datum main_arg) WalProposerPoll(); } +static bool +WalSegmentExists(XLogRecPtr startpos) +{ + char path[MAXPGPATH]; + XLogSegNo segNo; + XLByteToSeg(startpos, segNo, serverInfo.walSegSize); + XLogFilePath(path, serverInfo.timeline, segNo, serverInfo.walSegSize); + return access(path, F_OK) == 0; +} + static void WalProposerStartStreaming(XLogRecPtr 
startpos) { @@ -322,6 +332,14 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); + /* Requested segment may not exists because we generate new segment at node startup (aka pg_resetwal). + * So just skip it. + */ + if (!WalSegmentExists(startpos) && WalSegmentExists(startpos + serverInfo.walSegSize)) + { + elog(LOG, "Advance start position %llx to next segment", startpos); + startpos += serverInfo.walSegSize; + } cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; From 9a4d5427ab9a8fb377c5f5021ca49115f5871c44 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 11 Jun 2021 16:39:25 +0300 Subject: [PATCH 021/214] [walproposer] Fix breaking out of WalProposerPoll and WaitEventSetWait inside. WAL proposer (as bgw without BGWORKER_BACKEND_DATABASE_CONNECTION) previously ignored SetLatch, so once caught up it stuck inside WalProposerPoll infinitely. Futher, WaitEventSetWait didn't have timeout, so we didn't try to reconnect if all connections are dead as well. Fix that. Also move break on latch set to the end of the loop to attempt ReconnectWalKeepers even if latch is constantly set. Per test_race_conditions (Python version now). --- src/backend/replication/walproposer.c | 50 +++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 857cef5deb7..410c0cab579 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -233,6 +233,7 @@ WalProposerMain(Datum main_arg) char* port; /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); @@ -535,13 +536,33 @@ StartElection(void) prop.epoch += 1; } +/* + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). + */ +static long +TimeToReconnect(TimestampTz now) +{ + TimestampTz passed; + TimestampTz till_reconnect; + + if (wal_acceptor_reconnect_timeout <= 0) + return -1; + + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); +} static void ReconnectWalKeepers(void) { /* Initiate reconnect if timeout is expired */ TimestampTz now = GetCurrentTimestamp(); - if (wal_acceptor_reconnect_timeout > 0 && now - last_reconnect_attempt > wal_acceptor_reconnect_timeout*1000) + if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; for (int i = 0; i < n_walkeepers; i++) @@ -633,23 +654,19 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe return true; } +/* Advance the WAL proposer state machine. 
*/ void WalProposerPoll(void) { while (true) { WaitEvent event; - int rc = WaitEventSetWait(waitEvents, -1, &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - WalKeeper* wk = (WalKeeper*)event.user_data; + TimestampTz now = GetCurrentTimestamp(); + int rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + WalKeeper* wk = (WalKeeper*) event.user_data; int i = (int)(wk - walkeeper); - /* If wait is terminated by error, postmaster die or latch event, then exit loop */ - if (rc <= 0 || (event.events & (WL_POSTMASTER_DEATH|WL_LATCH_SET)) != 0) - { - ResetLatch(MyLatch); - break; - } - /* communication with walkeepers */ if (event.events & WL_SOCKET_READABLE) { @@ -869,7 +886,20 @@ WalProposerPoll(void) elog(FATAL, "Unexpected write state %d", wk->state); } } + ReconnectWalKeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (event.events & (WL_LATCH_SET) != 0) + { + ResetLatch(MyLatch); + break; + } + } } From 423f73b25dea9eceeaa61883e9b5ddbea873dc13 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Jun 2021 18:03:41 +0300 Subject: [PATCH 022/214] [walproposer] Make it possible to start postgres without reading checkpoint from WAL + Check for presence of zenith.signal file to allow skip reading checkpoint record from WAL + Pass prev_record_ptr through zenith.signal file to postgres --- src/backend/access/transam/xlog.c | 50 +++++++++++++++++++++++---- src/backend/replication/walproposer.c | 18 ---------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1a13af5c2da..651e65b125a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6512,6 +6512,7 @@ StartupXLOG(void) bool reachedRecoveryTarget = false; bool haveBackupLabel = false; bool haveTblspcMap = false; + bool skipLastRecordReread = false; XLogRecPtr RecPtr, checkPointLoc, EndOfLog; @@ -7056,10 +7057,26 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; - if (RecPtr < checkPoint.redo) - ereport(PANIC, - (errmsg("invalid redo in checkpoint record"))); + { + int fd = BasicOpenFile("zenith.signal", O_RDWR | PG_BINARY); + if (fd >= 0) { + XLogRecPtr prevRecPtr = 0; + if ((size_t)read(fd, &prevRecPtr, sizeof prevRecPtr) != sizeof(prevRecPtr)) { + elog(LOG, "can't read previous record position from zenith.signal file: %m"); + } + LastRec = prevRecPtr; + /* Zenith hacks to spawn compute node without WAL */ + EndRecPtr = RecPtr = checkPoint.redo; + skipLastRecordReread = true; + close(fd); + } + else + { + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + } + } /* * Check whether we need to force recovery from WAL. If it appears to @@ -7725,8 +7742,28 @@ StartupXLOG(void) * valid or last applied record, so we can identify the exact endpoint of * what we consider the valid portion of WAL. */ - XLogBeginRead(xlogreader, LastRec); - record = ReadRecord(xlogreader, PANIC, false); + + /* + * We use the last WAL page to initialize the WAL for writing, + * so we better have it in memory. 
+ */ + if (skipLastRecordReread) + { + XLogRecPtr lastPage = EndRecPtr - (EndRecPtr % XLOG_BLCKSZ); + int idx = XLogRecPtrToBufIdx(lastPage); + XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); + xlogPageHdr->xlp_pageaddr = lastPage; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + readOff = XLogSegmentOffset(lastPage, wal_segment_size); + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + } + else + { + XLogBeginRead(xlogreader, LastRec); + record = ReadRecord(xlogreader, PANIC, false); + if (!record) + elog(PANIC, "could not re-read last record"); + } EndOfLog = EndRecPtr; /* @@ -7921,7 +7958,8 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; - memcpy(page, xlogreader->readBuf, len); + if (!skipLastRecordReread) + memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 410c0cab579..cbc68173ef5 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -314,16 +314,6 @@ WalProposerMain(Datum main_arg) WalProposerPoll(); } -static bool -WalSegmentExists(XLogRecPtr startpos) -{ - char path[MAXPGPATH]; - XLogSegNo segNo; - XLByteToSeg(startpos, segNo, serverInfo.walSegSize); - XLogFilePath(path, serverInfo.timeline, segNo, serverInfo.walSegSize); - return access(path, F_OK) == 0; -} - static void WalProposerStartStreaming(XLogRecPtr startpos) { @@ -333,14 +323,6 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - /* Requested segment may not exists because we generate new segment at node startup (aka pg_resetwal). - * So just skip it. - */ - if (!WalSegmentExists(startpos) && WalSegmentExists(startpos + serverInfo.walSegSize)) - { - elog(LOG, "Advance start position %llx to next segment", startpos); - startpos += serverInfo.walSegSize; - } cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; From f459143ef25559fdd1b74710e19516879cd43381 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 5 Jul 2021 17:30:26 +0300 Subject: [PATCH 023/214] [walproposer] Simplify WL_LATCH_SET testing in the walproposer --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index cbc68173ef5..e87b36287ce 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -876,7 +876,7 @@ WalProposerPoll(void) * each wal flush), then exit loop. (no need for pm death check due to * WL_EXIT_ON_PM_DEATH) */ - if (event.events & (WL_LATCH_SET) != 0) + if (event.events & WL_LATCH_SET) { ResetLatch(MyLatch); break; From f6acb0aad5970667155dc567885b99f7c12ee46e Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 25 Jun 2021 00:12:18 +0300 Subject: [PATCH 024/214] [walredo] Add basic support for Seccomp BPF mode This patch aims to make our bespoke WAL redo machinery more robust in the presence of untrusted (in other words, possibly malicious) inputs. Pageserver delegates complex WAL decoding duties to postgres, which means that the latter might fall victim to carefully designed malicious WAL records and start doing harmful things to the system. 
To prevent this, it has been decided to limit possible interactions with the outside world using the Secure Computing BPF mode. We use this mode to disable all syscalls not in the allowlist. Please refer to src/backend/postmaster/seccomp.c to learn more about the pros & cons of the current approach. + Fix some bugs in seccomp bpf wrapper * Use SCMP_ACT_TRAP instead of SCMP_ACT_KILL_PROCESS to receive signals. * Add a missing variant of select() syscall (thx to @knizhnik). * Write error messages to an fd stderr's currently pointing to. --- configure | 86 +++++++++++ configure.ac | 13 ++ src/Makefile.global.in | 1 + src/backend/postmaster/Makefile | 5 + src/backend/postmaster/seccomp.c | 236 +++++++++++++++++++++++++++++ src/backend/tcop/zenith_wal_redo.c | 77 +++++++++- src/include/pg_config.h.in | 3 + src/include/postmaster/seccomp.h | 26 ++++ 8 files changed, 443 insertions(+), 4 deletions(-) create mode 100644 src/backend/postmaster/seccomp.c create mode 100644 src/include/postmaster/seccomp.h diff --git a/configure b/configure index df63738bee1..c781dc19c3e 100755 --- a/configure +++ b/configure @@ -712,6 +712,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -858,6 +859,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1564,6 +1566,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8606,6 +8609,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14258,6 +14294,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? "library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index 44b630c5e17..5cbbf838b00 100644 --- a/configure.ac +++ b/configure.ac @@ -914,6 +914,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1567,6 +1575,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 0df9f13f4a1..342cc177995 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -186,6 +186,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833db..926ee077111 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -26,4 +26,9 @@ OBJS = \ syslogger.o \ walwriter.o +ifeq ($(with_libseccomp),yes) +OBJS += \ + seccomp.o +endif + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c new file mode 100644 index 00000000000..4ff34ebbd66 --- /dev/null +++ b/src/backend/postmaster/seccomp.c @@ -0,0 +1,236 @@ +/*------------------------------------------------------------------------- + * + * seccomp.c + * Secure Computing BPF API wrapper. + * + * Pageserver delegates complex WAL decoding duties to postgres, + * which means that the latter might fall victim to carefully designed + * malicious WAL records and start doing harmful things to the system. + * To prevent this, it has been decided to limit possible interactions + * with the outside world using the Secure Computing BPF mode. + * + * We use this mode to disable all syscalls not in the allowlist. This + * approach has its pros & cons: + * + * - We have to carefully handpick and maintain the set of syscalls + * required for the WAL redo process. Core dumps help with that. + * The method of trial and error seems to work reasonably well, + * but it would be nice to find a proper way to "prove" that + * the set in question is both necessary and sufficient. 
+ * + * - Once we enter the seccomp bpf mode, it's impossible to lift those + * restrictions (otherwise, what kind of "protection" would that be?). + * Thus, we have to either enable extra syscalls for the clean shutdown, + * or exit the process immediately via _exit() instead of proc_exit(). + * + * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom + * facility to deal with the forbidden syscalls? If we'd like to embed + * a startup security test, we should go with the latter; In that + * case, which one of the following options is preferable? + * + * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. + * Provide a common signal handler with a static switch to override + * its behavior for the test case. This would undermine the whole + * purpose of such protection, so we'd have to go further and remap + * the memory backing the switch as readonly, then ban mprotect(). + * Ugly and fragile, to say the least. + * + * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. + * Provide 2 different signal handlers: one for a test case, + * another for the main processing loop. Install the first one, + * enable seccomp, perform the test, switch to the second one, + * finally ban sigaction(), presto! + * + * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the + * test, then ban it altogether with another filter. The downside + * of this solution is that we don't actually check that + * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. + * + * Either approach seems to require two eBPF filter programs, + * which is unfortunate: the man page tells this is uncommon. + * Maybe I (@funbringer) am missing something, though; I encourage + * any reader to get familiar with it and scrutinize my conclusions. + * + * TODOs and ideas in no particular order: + * + * - Do something about mmap() in musl's malloc(). + * Definitely not a priority if we don't care about musl. + * + * - See if we can untangle PG's shutdown sequence (involving unlink()): + * + * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. + * * Investigate chroot() or mount namespaces for better FS isolation. + * * (Per Heikki) Simply call _exit(), no big deal. + * * Come up with a better idea? + * + * - Make use of seccomp's argument inspection (for what?). + * Unfortunately, it views all syscall arguments as scalars, + * so it won't work for e.g. string comparison in unlink(). + * + * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). + * + * - Test against various linux distros & glibc versions. + * I suspect that certain libc functions might involve slightly + * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. + * + * - Test on any arch other than amd64 to see if it works there. 
+ * + * + * IDENTIFICATION + * src/backend/postmaster/seccomp.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" +#include "postmaster/seccomp.h" + +#include +#include + +static void die(int code, const char *str); + +static bool seccomp_test_sighandler_done = false; +static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); +static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); + +static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); + +void seccomp_load_rules(PgSeccompRule *rules, int count) +{ +#define raise_error(str) \ + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: " str))) + + struct sigaction action = { .sa_flags = SA_SIGINFO }; + PgSeccompRule rule; + long fd; + + /* + * Install a test signal handler. + * XXX: pqsignal() is too restrictive for our purposes, + * since we'd like to examine the contents of siginfo_t. + */ + action.sa_sigaction = seccomp_test_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + raise_error("failed to install a test SIGSYS handler"); + + /* + * First, check that open of a well-known file works. + * XXX: We use raw syscall() to call the very open(). + */ + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (fd < 0 || seccomp_test_sighandler_done) + raise_error("failed to open a test file"); + close((int)fd); + + /* Set a trap on open() to test seccomp bpf */ + rule = PG_SCMP(open, SCMP_ACT_TRAP); + if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) + raise_error("failed to load a test filter"); + + /* Finally, check that open() now raises SIGSYS */ + (void)syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (!seccomp_test_sighandler_done) + raise_error("SIGSYS handler doesn't seem to work"); + + /* Now that everything seems to work, install a proper handler */ + action.sa_sigaction = seccomp_deny_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + raise_error("failed to install a proper SIGSYS handler"); + + /* If this succeeds, any syscall not in the list will crash the process */ + if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) + raise_error("failed to enter seccomp mode"); + +#undef raise_error +} + +/* + * Enter seccomp mode with a BPF filter that will only allow + * certain syscalls to proceed. + */ +static int +do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) +{ + scmp_filter_ctx ctx; + int rc = -1; + + /* Create a context with a default action for syscalls not in the list */ + if ((ctx = seccomp_init(def_action)) == NULL) + goto cleanup; + + for (int i = 0; i < count; i++) + { + PgSeccompRule *rule = &rules[i]; + if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) + goto cleanup; + } + + /* Try building & loading the program into the kernel */ + if ((rc = seccomp_load(ctx)) != 0) + goto cleanup; + +cleanup: + /* + * We don't need the context anymore regardless of the result, + * since either we failed or the eBPF program has already been + * loaded into the linux kernel. 
+ */ + seccomp_release(ctx); + return rc; +} + +static void +die(int code, const char *str) +{ + /* Best effort write to stderr */ + (void)write(fileno(stderr), str, strlen(str)); + + /* XXX: we don't want to run any atexit callbacks */ + _exit(code); +} + +static void +seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ +#define DIE_PREFIX "seccomp test signal handler: " + + /* Check that this signal handler is used only for a single test case */ + if (seccomp_test_sighandler_done) + die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); + seccomp_test_sighandler_done = true; + + if (signum != SIGSYS) + die(1, DIE_PREFIX "bad signal number\n"); + + /* TODO: maybe somehow extract the hardcoded syscall number */ + if (info->si_syscall != SCMP_SYS(open)) + die(1, DIE_PREFIX "bad syscall number\n"); + +#undef DIE_PREFIX +} + +static void +seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ + /* + * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() + * to resolve the syscall's name, since it calls strdup() + * under the hood (wtf!). + */ + char buffer[128]; + (void)snprintf(buffer, lengthof(buffer), + "---------------------------------------\n" + "seccomp: bad syscall %d\n" + "---------------------------------------\n", + info->si_syscall); + + /* + * Instead of silently crashing the process with + * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, + * we'd like to receive a real SIGSYS to print the + * message and *then* immediately exit. + */ + die(1, buffer); +} diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 4503648fc3e..7e00a9e985d 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -62,6 +62,11 @@ #include #endif +#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) +#define MALLOC_NO_MMAP +#include +#endif + #ifndef HAVE_GETRUSAGE #include "rusagestub.h" #endif @@ -73,9 +78,10 @@ #include "libpq/pqformat.h" #include "miscadmin.h" #include "postmaster/postmaster.h" -#include "storage/ipc.h" -#include "storage/bufmgr.h" +#include "postmaster/seccomp.h" #include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" #include "storage/proc.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" @@ -93,6 +99,44 @@ static BufferTag target_redo_tag; #define TRACE DEBUG5 +#ifdef HAVE_LIBSECCOMP +static void +enter_seccomp_mode(void) +{ + PgSeccompRule syscalls[] = + { + /* Hard requirements */ + PG_SCMP_ALLOW(exit_group), + PG_SCMP_ALLOW(pselect6), + PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(select), + PG_SCMP_ALLOW(write), + + /* Memory allocation */ + PG_SCMP_ALLOW(brk), +#ifndef MALLOC_NO_MMAP + /* TODO: musl doesn't have mallopt */ + PG_SCMP_ALLOW(mmap), + PG_SCMP_ALLOW(munmap), +#endif + + /* Enable those for a proper shutdown. 
+ PG_SCMP_ALLOW(munmap), + PG_SCMP_ALLOW(shmctl), + PG_SCMP_ALLOW(shmdt), + PG_SCMP_ALLOW(unlink), // shm_unlink + */ + }; + +#ifdef MALLOC_NO_MMAP + /* Ask glibc not to use mmap() */ + mallopt(M_MMAP_MAX, 0); +#endif + + seccomp_load_rules(syscalls, lengthof(syscalls)); +} +#endif + /* ---------------------------------------------------------------- * FIXME comment * PostgresMain @@ -245,6 +289,22 @@ WalRedoMain(int argc, char *argv[], RmgrTable[rmid].rm_startup(); } +#ifdef HAVE_LIBSECCOMP + /* We prefer opt-out to opt-in for greater security */ + bool enable_seccomp = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--disable-seccomp") == 0) + enable_seccomp = false; + + /* + * We deliberately delay the transition to the seccomp mode + * until it's time to enter the main processing loop; + * else we'd have to add a lot more syscalls to the allowlist. + */ + if (enable_seccomp) + enter_seccomp_mode(); +#endif + /* * Main processing loop */ @@ -289,6 +349,16 @@ WalRedoMain(int argc, char *argv[], */ case EOF: +#ifdef HAVE_LIBSECCOMP + /* + * Skip the shutdown sequence, leaving some garbage behind. + * Hopefully, postgres will clean it up in the next run. + * This way we don't have to enable extra syscalls, which is nice. + * See enter_seccomp_mode() above. + */ + if (enable_seccomp) + _exit(0); +#endif /* * NOTE: if you are tempted to add more code here, DON'T! * Whatever you had in mind to do should be set up as an @@ -636,8 +706,7 @@ GetPage(StringInfo input_message) /* single thread, so don't bother locking the page */ /* Response: Page content */ - fwrite(page, 1, BLCKSZ, stdout); /* FIXME: check errors */ - fflush(stdout); + write(STDOUT_FILENO, page, BLCKSZ); /* FIXME: check errors */ ReleaseBuffer(buf); DropDatabaseBuffers(rnode.dbNode); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 51fa911fb6a..c88ba837f46 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -339,6 +339,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). */ #undef HAVE_LIBSELINUX diff --git a/src/include/postmaster/seccomp.h b/src/include/postmaster/seccomp.h new file mode 100644 index 00000000000..1613d34bd47 --- /dev/null +++ b/src/include/postmaster/seccomp.h @@ -0,0 +1,26 @@ +#ifndef PG_SECCOMP_H +#define PG_SECCOMP_H + +#include "postgres.h" + +#ifdef HAVE_LIBSECCOMP +#include +#endif + +typedef struct { + int psr_syscall; /* syscall number */ + uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ +} PgSeccompRule; + +#define PG_SCMP(syscall, action) \ + (PgSeccompRule) { \ + .psr_syscall = SCMP_SYS(syscall), \ + .psr_action = (action), \ + } + +#define PG_SCMP_ALLOW(syscall) \ + PG_SCMP(syscall, SCMP_ACT_ALLOW) + +void seccomp_load_rules(PgSeccompRule *syscalls, int count); + +#endif /* PG_SECCOMP_H */ From 34b85e8bcf7ff589c2c826b15e0b9339aac66656 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 Jul 2021 11:56:20 +0300 Subject: [PATCH 025/214] [smgr_api] [contrib/zenith] 1. Do not call mdinit from smgrinit() because it cause memory leak in wal-redo-postgres 2. 
Add check for local relations to make it possible to use DEBUG_COMPARE_LOCAL mode in SMGR + Call smgr_init_standard from smgr_init_zenith --- contrib/zenith/pagestore_smgr.c | 29 ++++++++++++++++++++--------- src/backend/storage/smgr/smgr.c | 7 ++----- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 19e39ffeb74..29c5c46c0e3 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -44,6 +44,8 @@ static char *hexdump_page(char *page); #endif +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + const int SmgrTrace = DEBUG5; bool loaded = false; @@ -492,7 +494,8 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) forkNum); #ifdef DEBUG_COMPARE_LOCAL - mdcreate(reln, forkNum, isRedo); + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); #endif } @@ -548,7 +551,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (uint32) (lsn >> 32), (uint32) lsn); #ifdef DEBUG_COMPARE_LOCAL - mdextend(reln, forkNum, blkno, buffer, skipFsync); + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif } @@ -562,7 +566,8 @@ zenith_open(SMgrRelation reln) elog(SmgrTrace, "[ZENITH_SMGR] open noop"); #ifdef DEBUG_COMPARE_LOCAL - mdopen(reln); + if (IS_LOCAL_REL(reln)) + mdopen(reln); #endif } @@ -576,7 +581,8 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) elog(SmgrTrace, "[ZENITH_SMGR] close noop"); #ifdef DEBUG_COMPARE_LOCAL - mdclose(reln, forknum); + if (IS_LOCAL_REL(reln)) + mdclose(reln, forknum); #endif } @@ -605,7 +611,8 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); #ifdef DEBUG_COMPARE_LOCAL - mdwriteback(reln, forknum, blocknum, nblocks); + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); #endif } @@ -647,7 +654,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, #ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM) + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; char mdbuf[BLCKSZ]; @@ -828,7 +835,8 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, (uint32) (lsn >> 32), (uint32) lsn); #ifdef DEBUG_COMPARE_LOCAL - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + if (IS_LOCAL_REL(reln)) + mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif } @@ -894,7 +902,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) SetLastWrittenPageLSN(lsn); #ifdef DEBUG_COMPARE_LOCAL - mdtruncate(reln, forknum, nblocks); + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, nblocks); #endif } @@ -915,7 +924,8 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL - mdimmedsync(reln, forknum); + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); #endif } @@ -953,5 +963,6 @@ smgr_zenith(BackendId backend, RelFileNode rnode) void smgr_init_zenith(void) { + smgr_init_standard(); zenith_init(); } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index b455d07edce..8d2b6b73b29 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -69,10 +69,7 @@ static dlist_head unowned_relns; void smgrinit(void) { - if (smgr_init_hook) - (*smgr_init_hook)(); - - smgr_init_standard(); + (*smgr_init_hook)(); /* * ZENITH XXX @@ 
-98,7 +95,7 @@ smgrinit(void) /* Hook for plugins to get control in smgr */ smgr_hook_type smgr_hook = NULL; -smgr_init_hook_type smgr_init_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; smgr_shutdown_hook_type smgr_shutdown_hook = NULL; const f_smgr * From 7e2b41779e9608799e9d978a13a7b6ceaaa84994 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 15 Jul 2021 15:50:44 +0300 Subject: [PATCH 026/214] [walproposer] [contrib/zenith] support zenith_tenant this patch adds support for zenith_tenant variable. it has similar format as zenith_timeline. It is used in callmemaybe query to pass tenant to pageserver and in ServerInfo structure passed to wal acceptor --- contrib/zenith/libpagestore.c | 26 +++++++++++++++++++------- contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 1 + src/backend/replication/walproposer.c | 8 ++++++++ src/include/replication/walproposer.h | 2 ++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 062f0cbf2e0..89bfbe1906c 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -67,11 +67,13 @@ zenith_connect() } /* Ask the Page Server to connect to us, and stream WAL from us. */ - if (callmemaybe_connstring && callmemaybe_connstring[0]) + if (callmemaybe_connstring && callmemaybe_connstring[0] + && zenith_tenant + && zenith_timeline) { PGresult *res; - query = psprintf("callmemaybe %s %s", zenith_timeline, callmemaybe_connstring); + query = psprintf("callmemaybe %s %s %s", zenith_tenant, zenith_timeline, callmemaybe_connstring); res = PQexec(pageserver_conn, query); if (PQresultStatus(res) != PGRES_COMMAND_OK) { @@ -81,7 +83,7 @@ zenith_connect() PQclear(res); } - query = psprintf("pagestream %s", zenith_timeline); + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) zenith_log(ERROR, @@ -185,11 +187,11 @@ zenith_call(ZenithRequest request) static bool -check_zenith_timeline(char **newval, void **extra, GucSource source) +check_zenith_id(char **newval, void **extra, GucSource source) { - uint8 ztimelineid[16]; + uint8 zid[16]; - return **newval == '\0' || HexDecodeString(ztimelineid, *newval, 16); + return **newval == '\0' || HexDecodeString(zid, *newval, 16); } /* @@ -223,7 +225,16 @@ _PG_init(void) "", PGC_POSTMASTER, 0, /* no flags required */ - check_zenith_timeline, NULL, NULL); + check_zenith_id, NULL, NULL); + + DefineCustomStringVariable("zenith.zenith_tenant", + "Zenith tenantid the server is running on", + NULL, + &zenith_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); DefineCustomBoolVariable("zenith.wal_redo", "start in wal-redo mode", @@ -242,6 +253,7 @@ _PG_init(void) /* Is there more correct way to pass CustomGUC to postgres code? 
*/ zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; if (wal_redo) { diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 400fb259a6b..b4b223d3c46 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -90,6 +90,7 @@ extern page_server_api * page_server; extern char *page_server_connstring; extern char *callmemaybe_connstring; extern char *zenith_timeline; +extern char *zenith_tenant; extern bool wal_redo; extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 29c5c46c0e3..858a67841ea 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -56,6 +56,7 @@ page_server_api *page_server; char *page_server_connstring; char *callmemaybe_connstring; char *zenith_timeline; +char *zenith_tenant; bool wal_redo = false; char const *const ZenithMessageStr[] = diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index e87b36287ce..7a7996a1f82 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -221,6 +221,7 @@ HandleWalKeeperResponse(void) } char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; /* * WAL proposer bgworeker entry point @@ -285,6 +286,13 @@ WalProposerMain(Datum main_arg) if (*zenith_timeline_walproposer != '\0' && !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + + if (!zenith_tenant_walproposer) + elog(FATAL, "zenith.zenith_tenant is not provided"); + if (*zenith_tenant_walproposer != '\0' && + !HexDecodeString(serverInfo.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + serverInfo.protocolVersion = SK_PROTOCOL_VERSION; pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); serverInfo.systemId = GetSystemIdentifier(); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index e1845e3fb19..d770473ad35 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -23,6 +23,7 @@ struct WalMessage; typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; /* WAL safekeeper state */ typedef enum @@ -59,6 +60,7 @@ typedef struct ServerInfo XLogRecPtr walEnd; TimeLineID timeline; int walSegSize; + uint8 ztenantid[16]; } ServerInfo; /* From 66fb5b49e1a85617666be39a4773c22bf7c7e736 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 4 Aug 2021 11:56:41 +0300 Subject: [PATCH 027/214] [walproposer] Remove graceful termination of COPY during walproposer recovery. Rust's postgres_backend currently is too dummy to handle it properly: reading happens in separate thread which just ignores CopyDone. Instead, writer thread must get aware of termination and send CommandComplete. Also reading socket must be transferred back to postgres_backend (or connection terminated completely after COPY). Let's do that after more basic safkeeper refactoring and right now cover this up to make tests pass. 
ref #388 --- src/backend/replication/walproposer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7a7996a1f82..240e3769a6a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -612,7 +612,6 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe if (rec_end_lsn >= endpos) break; } - walrcv_endstreaming(wrconn, &timeline); walrcv_disconnect(wrconn); } else From 616f45a1d069f95e9f30fe4dd7008167cf1a0311 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 9 Aug 2021 09:46:07 +0300 Subject: [PATCH 028/214] [walproposer] [contrib/zenith] [refer #395] Do no align sart replication position in wal_proppser to segment boundary --- contrib/zenith/pagestore_smgr.c | 4 + src/backend/replication/walproposer.c | 359 +++++++++++++------------- 2 files changed, 181 insertions(+), 182 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 858a67841ea..3d24cb79f5f 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -21,6 +21,7 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" #include "access/xlogdefs.h" +#include "postmaster/interrupt.h" #include "storage/bufmgr.h" #include "fmgr.h" #include "miscadmin.h" @@ -243,6 +244,9 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { XLogRecPtr lsn = PageGetLSN(buffer); + if (ShutdownRequestPending) + return; + /* * If the page was not WAL-logged before eviction then we can lose its modification. * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 240e3769a6a..837b751a24c 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -326,11 +326,6 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; - /* - * Always start streaming at the beginning of a segment - */ - startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; @@ -656,226 +651,227 @@ WalProposerPoll(void) WalKeeper* wk = (WalKeeper*) event.user_data; int i = (int)(wk - walkeeper); - /* communication with walkeepers */ - if (event.events & WL_SOCKET_READABLE) + if (rc != 0) { - switch (wk->state) + /* communication with walkeepers */ + if (event.events & WL_SOCKET_READABLE) { - case SS_HANDSHAKE: - /* Receive walkeeper node state */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info + wk->asyncOffs, - sizeof(wk->info) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info)) - { - /* WalKeeper response completely received */ - - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + switch (wk->state) + { + case SS_HANDSHAKE: + /* Receive walkeeper node state */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info + wk->asyncOffs, + sizeof(wk->info) - wk->asyncOffs); + if (rc < 0) { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. 
%d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); ResetConnection(i); } - else + else if ((wk->asyncOffs += rc) == sizeof(wk->info)) { - wk->state = SS_VOTING; - wk->feedback.flushLsn = restartLsn; - wk->feedback.hs.ts = 0; + /* WalKeeper response completely received */ - /* Check if we have quorum */ - if (++n_connected >= quorum) + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) { - if (n_connected == quorum) - StartElection(); + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + } + else + { + wk->state = SS_VOTING; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; - /* Now send max-node-id to everyone participating in voting and wait their responses */ - for (int j = 0; j < n_walkeepers; j++) + /* Check if we have quorum */ + if (++n_connected >= quorum) { - if (walkeeper[j].state == SS_VOTING) + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) { - if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) - { - ResetConnection(j); - } - else + if (walkeeper[j].state == SS_VOTING) { - walkeeper[j].asyncOffs = 0; - walkeeper[j].state = SS_WAIT_VERDICT; + if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) + { + ResetConnection(j); + } + else + { + walkeeper[j].asyncOffs = 0; + walkeeper[j].state = SS_WAIT_VERDICT; + } } } } } } - } - break; - - case SS_WAIT_VERDICT: - /* Receive walkeeper response for our candidate */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info.server.nodeId + wk->asyncOffs, - sizeof(wk->info.server.nodeId) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) - { - /* Response completely received */ - - /* If server accept our candidate, then it returns it in response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + break; + + case SS_WAIT_VERDICT: + /* Receive walkeeper response for our candidate */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info.server.nodeId + wk->asyncOffs, + sizeof(wk->info.server.nodeId) - wk->asyncOffs); + if (rc < 0) { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); + ResetConnection(i); } - else + else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) { - /* Handshake completed, do we have quorum? 
*/ - wk->state = SS_IDLE; - if (++n_votes == quorum) - { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); + /* Response completely received */ - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn != prop.VCL) - { - /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) - elog(FATAL, "Failed to recover state"); - } - WalProposerStartStreaming(prop.VCL); - /* Should not return here */ + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); } else { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); + /* Handshake completed, do we have quorum? */ + wk->state = SS_IDLE; + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn != prop.VCL) + { + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); + } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } } } - } - break; - - case SS_RECV_FEEDBACK: - /* Read walkeeper response with flushed WAL position */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->feedback + wk->asyncOffs, - sizeof(wk->feedback) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) - { - WalMessage* next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->asyncOffs = 0; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); - - /* - * Also send the new VCL to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. - */ - if (true) + break; + + case SS_RECV_FEEDBACK: + /* Read walkeeper response with flushed WAL position */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->feedback + wk->asyncOffs, + sizeof(wk->feedback) - wk->asyncOffs); + if (rc < 0) { - XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - WalMessage *vclUpdateMsg; - - if (minQuorumLsn > lastSentVCLLsn) + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + { + WalMessage* next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + wk->state = SS_IDLE; + wk->asyncOffs = 0; + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. 
+ */ + if (true) { - vclUpdateMsg = CreateMessageVCLOnly(); - if (vclUpdateMsg) - BroadcastMessage(vclUpdateMsg); - lastSentVCLLsn = minQuorumLsn; + XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + WalMessage *vclUpdateMsg; + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; + } } } - } - break; - - case SS_IDLE: - elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); - ResetConnection(i); - break; + break; + case SS_IDLE: + elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); + ResetConnection(i); + break; - default: - elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + default: + elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + } } - } - else if (event.events & WL_SOCKET_WRITEABLE) - { - switch (wk->state) + else if (event.events & WL_SOCKET_WRITEABLE) { - case SS_CONNECTING: + switch (wk->state) { - int optval = 0; - ACCEPT_TYPE_ARG3 optlen = sizeof(optval); - if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + case SS_CONNECTING: { - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, - strerror(optval)); - closesocket(wk->sock); - wk->sock = PGINVALID_SOCKET; - wk->state = SS_OFFLINE; - ResetWalProposerEventSet(); - } - else - { - uint32 len = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - /* - * Start handshake: send information about server. - * First of all send 0 as package size: it allows walkeeper to distinguish - * wal_proposer's connection from standard replication connection from pagers. - */ - if (WriteSocket(wk->sock, &len, sizeof len) - && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + int optval = 0; + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); + if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) { - wk->state = SS_HANDSHAKE; - wk->asyncOffs = 0; + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, + strerror(optval)); + closesocket(wk->sock); + wk->sock = PGINVALID_SOCKET; + wk->state = SS_OFFLINE; + ResetWalProposerEventSet(); } else { - ResetConnection(i); + uint32 len = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + /* + * Start handshake: send information about server. + * First of all send 0 as package size: it allows walkeeper to distinguish + * wal_proposer's connection from standard replication connection from pagers. 
+ */ + if (WriteSocket(wk->sock, &len, sizeof len) + && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + { + wk->state = SS_HANDSHAKE; + wk->asyncOffs = 0; + } + else + { + ResetConnection(i); + } } + break; } - break; - } - case SS_SEND_WAL: - rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == wk->currMsg->size) - { - /* WAL block completely sent */ - wk->state = SS_RECV_FEEDBACK; - wk->asyncOffs = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - } - break; + case SS_SEND_WAL: + rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == wk->currMsg->size) + { + /* WAL block completely sent */ + wk->state = SS_RECV_FEEDBACK; + wk->asyncOffs = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + } + break; - default: - elog(FATAL, "Unexpected write state %d", wk->state); + default: + elog(FATAL, "Unexpected write state %d", wk->state); + } } } - ReconnectWalKeepers(); /* @@ -883,12 +879,11 @@ WalProposerPoll(void) * each wal flush), then exit loop. (no need for pm death check due to * WL_EXIT_ON_PM_DEATH) */ - if (event.events & WL_LATCH_SET) + if (rc != 0 && (event.events & WL_LATCH_SET)) { ResetLatch(MyLatch); break; } - } } From 65ce50b208acab0fcc577b3339c29bac779211ef Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 10 Aug 2021 08:10:08 +0300 Subject: [PATCH 029/214] [test] Add contrib/zenith_test_utils with helpers for testing and debugging. Now it contains only one function test_consume_xids() for xid wraparound testing. --- contrib/zenith_test_utils/Makefile | 22 ++++++++ .../zenith_test_utils--1.0.sql | 8 +++ .../zenith_test_utils.control | 5 ++ contrib/zenith_test_utils/zenithtest.c | 50 +++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 contrib/zenith_test_utils/Makefile create mode 100644 contrib/zenith_test_utils/zenith_test_utils--1.0.sql create mode 100644 contrib/zenith_test_utils/zenith_test_utils.control create mode 100644 contrib/zenith_test_utils/zenithtest.c diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile new file mode 100644 index 00000000000..9203f2349d3 --- /dev/null +++ b/contrib/zenith_test_utils/Makefile @@ -0,0 +1,22 @@ +# contrib/zenith_test_utils/Makefile + + +MODULE_big = zenith_test_utils +OBJS = \ + $(WIN32RES) \ + zenithtest.o + +EXTENSION = zenith_test_utils +DATA = zenith_test_utils--1.0.sql +PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/zenith_test_utils +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql new file mode 100644 index 00000000000..6c8fe6521cf --- /dev/null +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -0,0 +1,8 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION zenith_test_utils" to load this file. 
\quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenith_test_utils.control b/contrib/zenith_test_utils/zenith_test_utils.control new file mode 100644 index 00000000000..9b947b63966 --- /dev/null +++ b/contrib/zenith_test_utils/zenith_test_utils.control @@ -0,0 +1,5 @@ +# zenith_test_utils extension +comment = 'helpers for zenith testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/zenith_test_utils' +relocatable = true diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c new file mode 100644 index 00000000000..a7eb278a09b --- /dev/null +++ b/contrib/zenith_test_utils/zenithtest.c @@ -0,0 +1,50 @@ +/*------------------------------------------------------------------------- + * + * zenithtest.c + * Helpers for zenith testing and debugging + * + * IDENTIFICATION + * contrib/zenith_test_utils/zenithtest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" + + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_consume_xids); + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. + */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} From ce222a359446d95dbf2db04ed4d50f065ac71a5a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 11 Aug 2021 08:49:54 +0300 Subject: [PATCH 030/214] [walproposer] Change condition for triggering recovery --- src/backend/replication/walproposer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 837b751a24c..a03ca9952a0 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -212,7 +212,10 @@ HandleWalKeeperResponse(void) WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; if (restartLsn < msg->req.beginLsn) + { + Assert(restartLsn < msg->req.endLsn); restartLsn = msg->req.endLsn; + } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); free(msg); } @@ -326,6 +329,8 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; @@ -357,9 +362,11 @@ SendMessageToNode(int i, WalMessage* msg) msg->req.restartLsn = restartLsn; msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - elog(LOG, "sending message with len %ld VCL=%X/%X to %d", - msg->size - sizeof(WalKeeperRequest), - (uint32) (msg->req.commitLsn >> 32), (uint32) msg->req.commitLsn, i); + elog(LOG, "sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %d", + msg->size - sizeof(WalKeeperRequest), + 
LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(restartLsn), + i); rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); if (rc < 0) @@ -743,8 +750,10 @@ WalProposerPoll(void) ); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn != prop.VCL) + if (restartLsn < prop.VCL) { + elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); /* Perform recovery */ if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) elog(FATAL, "Failed to recover state"); From 1ef045c364aaca3e1e75bd4a9a0cabcf910aa5e4 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 5 Aug 2021 02:14:55 +0300 Subject: [PATCH 031/214] [contrib/zenith] Use authentication token passed as environment variable in connections to pageserver. Token is passed as cleartext password. --- contrib/zenith/libpagestore.c | 87 +++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 89bfbe1906c..142999a6a8e 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -50,10 +50,91 @@ page_server_api api = { static void zenith_connect() { - char *query; - int ret; + char *query; + int ret; + char *auth_token; + char *err = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + int noptions = 0; + + // this is heavily inspired by psql/command.c::do_connect + conn_options = PQconninfoParse( + page_server_connstring, + &err + ); + + if (conn_options == NULL) { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + // Trying to populate pageserver connection string with auth token from environment. 
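// For illustration (the variable name and values below are made up): with a
// pageserver connection string such as
//     host=127.0.0.1 port=6400 password=$ZENITH_AUTH_TOKEN
// and the variable exported in the server's environment, e.g.
//     export ZENITH_AUTH_TOKEN='<token>'
// the $ZENITH_AUTH_TOKEN placeholder is replaced with the result of
// getenv("ZENITH_AUTH_TOKEN") before the options are handed to PQconnectdbParams() below.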
+ // We are looking for password in with placeholder value like $ENV_VAR_NAME, so if password field is present + // and starts with $ we try to fetch environment variable value and fail loudly if it is not set + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + noptions++; + if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + // ensure that this is a template + if (strncmp(conn_option->val, "$", 1) != 0) { + ereport( + ERROR, + ( + errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]) + ) + ); + } + + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) { + ereport( + ERROR, + ( + errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]) + ) + ); + } else { + zenith_log(LOG, "using auth token from environment passed via env"); + + // inspired by PQconninfoFree and conninfo_storeval + // so just free the old one and replace with freshly malloc'ed one + free(conn_option->val); + conn_option->val = strdup(auth_token); + } + } + } + } + + // copy values from PQconninfoOption to key/value arrays because PQconnectdbParams accepts options this way + const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); + const char **values = malloc((noptions + 1) * sizeof(*values)); + int i = 0; + + for (i = 0; i < noptions; i++) + { + keywords[i] = conn_options[i].keyword; + values[i] = conn_options[i].val; + } + // add array terminator + keywords[i] = NULL; + values[i] = NULL; + + pageserver_conn = PQconnectdbParams(keywords, values, false); + free(keywords); + free(values); - pageserver_conn = PQconnectdb(page_server_connstring); + PQconninfoFree(conn_options); if (PQstatus(pageserver_conn) == CONNECTION_BAD) { From 3bd780f45d1309c63e75b712e985607e0a16d227 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 13 Aug 2021 14:01:07 +0300 Subject: [PATCH 032/214] [contrib/zenith] Fix race condition while WAL-logging page, leading to CRC errors. zenith_wallog_page() would call log_newpage() on a buffer, while holding merely a shared lock on the page. That's not cool, because another backend could modify the page concurrently. We allow changing hint bits while holding only a shared lock, and changes on FSM pages, at least. See comments in XLogSaveBufferForHint() for discussion of this problem. One instance of the race condition that I was able to capture on my laptop happened like this: 1. Backend A: needs to evict an FSM page from the buffer cache to make room for a new page, and calls zenith_wallog_page() on it. That is done while holding a share lock on the page. 2. Backend A: XLogInsertRecord() computes the CRC of the FPI WAL record including the FSM page 3. Backend B: Updates the same FSM page while holding only a share lock 4. Backend A: Allocates space in the WAL buffers, and copies the WAL record header and the page to the buffers. At this point, the CRC that backend A computed earlier doesn't match the contents that were written out to the WAL buffers. The update of the FSM page in backend B happened from there (fsmpage.c): /* * Update the next-target pointer. 
Note that we do this even if we're only * holding a shared lock, on the grounds that it's better to use a shared * lock and get a garbled next pointer every now and then, than take the * concurrency hit of an exclusive lock. * * Wrap-around is handled at the beginning of this function. */ fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); To fix, make a temporary copy of the page in zenith_wallog_page(), and WAL-log that. Just like XLogSaveBufferForHint() does. Fixes https://github.com/zenithdb/zenith/issues/413 --- contrib/zenith/pagestore_smgr.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 3d24cb79f5f..47a37b0687d 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -238,6 +238,25 @@ zm_to_string(ZenithMessage *msg) return s.data; } +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + /* set the flag in the original page, like log_newpage() does. */ + ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) @@ -264,7 +283,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X", @@ -284,7 +303,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Hopefully we do not evict actively used vm too often. */ XLogRecPtr recptr; - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -307,7 +326,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * TODO Do we have any special page types? */ - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); /* If we wal-log hint bits, someone could concurrently update page * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. From 8c768c0a0bfd4edeefd0fb61cfce291b9f26f355 Mon Sep 17 00:00:00 2001 From: Max Sharnoff Date: Fri, 13 Aug 2021 11:23:16 -0700 Subject: [PATCH 033/214] [walproposer] Rework walkeeper protocol to use libpq (#60) The majority of work here is going to be heavily cleaned up soon, but it's worth giving a brief overview of the changes either way. * Adds libpqwalproposer, serving a similar function to the existing libpqwalreceiver -- to provide access to libpq functions without causing problems from directly linking them. 
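  The indirection follows the same pattern as libpqwalreceiver: the backend core
  never calls libpq directly, it calls through a function table that the shared
  library installs from its _PG_init(). Roughly (the struct member names here
  are illustrative; the real declarations live in walproposer.h):

      /* set by libpqwalproposer's _PG_init(), NULL until the library is loaded */
      extern WalProposerFunctionsType *WalProposerFunctions;

      /* the core wraps each call, e.g. */
      #define walprop_status(conn) \
          (WalProposerFunctions->walprop_status(conn))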
* Adds two new state components, giving (a) the type of libpq-specific polling required to move on to the next protocol state and (b) the kind of socket events it's waiting on. (These are expected to be removed or heavily reworked soon.) * Changes `WalProposerPoll` to make use of a slightly more specialized `AdvancePollState`, which has been completely reworked. --- src/Makefile | 1 + .../replication/libpqwalproposer/Makefile | 37 + .../libpqwalproposer/libpqwalproposer.c | 327 +++++ src/backend/replication/walproposer.c | 1169 ++++++++++++----- src/backend/replication/walproposer_utils.c | 285 ++-- src/include/replication/walproposer.h | 422 +++++- 6 files changed, 1783 insertions(+), 458 deletions(-) create mode 100644 src/backend/replication/libpqwalproposer/Makefile create mode 100644 src/backend/replication/libpqwalproposer/libpqwalproposer.c diff --git a/src/Makefile b/src/Makefile index 79e274a4769..2f32e3d5137 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,6 +22,7 @@ SUBDIRS = \ include \ interfaces \ backend/replication/libpqwalreceiver \ + backend/replication/libpqwalproposer \ backend/replication/pgoutput \ fe_utils \ bin \ diff --git a/src/backend/replication/libpqwalproposer/Makefile b/src/backend/replication/libpqwalproposer/Makefile new file mode 100644 index 00000000000..c570160536f --- /dev/null +++ b/src/backend/replication/libpqwalproposer/Makefile @@ -0,0 +1,37 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/libpqwalproposer +# +# IDENTIFICATION +# src/backend/replication/libpqwalproposer/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/libpqwalproposer +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) + +OBJS = \ + $(WIN32RES) \ + libpqwalproposer.o +SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = $(filter -lintl, $(LIBS)) +SHLIB_PREREQS = submake-libpq +PGFILEDESC = "libpqwalproposer - libpq interface for WAL proposer" +NAME = libpqwalproposer + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c new file mode 100644 index 00000000000..a5d7fec1a33 --- /dev/null +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -0,0 +1,327 @@ +#include "replication/walproposer.h" +#include "libpq-fe.h" + +/* Required for anything that's dynamically loaded */ +PG_MODULE_MAGIC; +void _PG_init(void); + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static int libpqprop_set_nonblocking(WalProposerConn* conn, int arg); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static int libpqprop_consume_input(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_set_nonblocking, + libpqprop_socket, + libpqprop_flush, + libpqprop_consume_input, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, +}; + +/* Module initialization */ +void +_PG_init(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. 
+ */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + int result; + bool return_val; + + switch ((result = PQsendQuery(conn->pg_conn, query))) + { + case 0: + return_val = false; + break; + case 1: + return_val = true; + break; + default: + elog(FATAL, "unexpected return %d from PQsendQuery", result); + } + + return return_val; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static int +libpqprop_set_nonblocking(WalProposerConn* conn, int arg) +{ + return PQsetnonblocking(conn->pg_conn, arg); +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static int +libpqprop_consume_input(WalProposerConn* conn) +{ + return (PQconsumeInput(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + PQfinish(conn->pg_conn); + pfree(conn); +} + +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occured + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and walkeeper means that we + * (i.e. walproposer) won't ever receive a message saying that the copy + * is done. */ + switch (result = PQgetCopyData(conn->pg_conn, buf, true)) + { + case 0: + return PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN; + case -1: + /* As mentioned above; this shouldn't happen */ + elog(FATAL, "unexpected return -1 from PQgetCopyData"); + break; + case -2: + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* The docs for PQputcopyData list the return values as: + * 1 if the data was queued, + * 0 if it was not queued because of full buffers, or + * -1 if an error occured + */ + switch (result = PQputCopyData(conn->pg_conn, buf, size)) + { + case 1: + /* good -- continue */ + break; + case 0: + /* FIXME: can this ever happen? the structure of walproposer + * should always empty the connection's buffers before trying + * to send more, right? 
*/ + return PG_ASYNC_WRITE_WOULDBLOCK; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* After queueing the data, we still need to flush to get it to send. + * This might take multiple tries, but we don't want to wait around + * until it's done. + * + * PQflush has the following returns (directly quoting the docs): + * 0 if sucessful, + * 1 if it was unable to send all the data in the send queue yet + * -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a03ca9952a0..102ce033949 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -21,10 +21,15 @@ #include "utils/memutils.h" #include "utils/timestamp.h" + char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; + +/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ +WalProposerFunctionsType* WalProposerFunctions = NULL; + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" static int n_walkeepers = 0; @@ -43,6 +48,12 @@ static int leader; /* Most advanced walkeeper */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; +static uint32 request_poll_immediate; /* bitset of walkeepers requesting AdvancePollState */ + +/* Declarations of a few functions ahead of time, so that we can define them out of order. */ +static void AdvancePollState(int i, uint32 events); +static bool ReadPGAsyncIntoValue(int i, void* value, size_t value_size); +static void HackyRemoveWalProposerEvent(int to_remove); /* * Combine hot standby feedbacks from all walkeepers. @@ -72,38 +83,115 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) } } +/* Initializes the internal event set, provided that it is currently null */ static void -ResetWalProposerEventSet(void) +InitEventSet(void) { if (waitEvents) - FreeWaitEventSet(waitEvents); + elog(FATAL, "double-initialization of event set"); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); +} + +/* + * Updates the stored wait event for the walkeeper, given its current sockWaitState + * + * remove_if_nothing specifies whether to remove the event if the new waiting set is empty. In + * certain cases, we have remove_if_nothing = false because it's known that the walkeeper state will + * be updated immediately after if it's not waiting for any events. + * + * In general, setting remove_if_nothing = false is just an optimization; setting it to true will + * almost always be correct. Please leave a comment arguing for the validity of this optimization if + * you use it. + */ +static void +UpdateEventSet(int i, bool remove_if_nothing) +{ + uint32 events; + WalKeeper* wk = &walkeeper[i]; + + /* + * If there isn't an applicable way to update the event, we just don't bother. This function is + * sometimes called when the walkeeper isn't waiting for anything, and so the best thing to do + * is just nothing. 
+ */ + if (wk->sockWaitState != WANTS_NO_WAIT) + { + events = WaitKindAsEvents(wk->sockWaitState); + + /* If we don't already have an event, add one! */ + if (wk->eventPos == -1) + wk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(wk->conn), NULL, wk); + else + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + } + else if (remove_if_nothing && wk->eventPos != 1) + HackyRemoveWalProposerEvent(i); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(int to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any walkeeper events */ + InitEventSet(); + + /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have + * a socket we can use, re-add the applicable events. + * + * We're expecting that there's no other walkeepers with `.sockWaitState = WANTS_NO_WAIT`, + * because any state without waiting should should have been handled immediately. */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].sock != PGINVALID_SOCKET) + walkeeper[i].eventPos = -1; + + if (i == to_remove) + continue; + + if (walkeeper[i].conn) { - int events; - switch (walkeeper[i].state) + UpdateEventSet(i, false); + + if (walkeeper[i].sockWaitState == WANTS_NO_WAIT) { - case SS_SEND_WAL: - events = WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE; - break; - case SS_CONNECTING: - events = WL_SOCKET_WRITEABLE; - break; - default: - events = WL_SOCKET_READABLE; - break; + elog(FATAL, "Unexpected walkeeper %s:%s in %s state waiting for nothing", + walkeeper[i].host, walkeeper[i].port, FormatWalKeeperState(walkeeper[i].state)); + } + else + { + UpdateEventSet(i, false); /* Will either add an event or do nothing */ } - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, events, walkeeper[i].sock, NULL, &walkeeper[i]); } } } +/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(int i, bool remove_event) +{ + if (walkeeper[i].conn) + walprop_finish(walkeeper[i].conn); + walkeeper[i].conn = NULL; + walkeeper[i].state = SS_OFFLINE; + walkeeper[i].pollState = SPOLL_NONE; + walkeeper[i].sockWaitState = WANTS_NO_WAIT; + + if (remove_event) + HackyRemoveWalProposerEvent(i); +} + /* * This function is called to establish new connection or to reestablish connection in case * of connection failure. 
@@ -112,52 +200,74 @@ ResetWalProposerEventSet(void) static void ResetConnection(int i) { - bool established; + pgsocket sock; /* socket of the new connection */ + WalKeeper *wk = &walkeeper[i]; - if (walkeeper[i].state != SS_OFFLINE) + if (wk->state != SS_OFFLINE) { - elog(WARNING, "Connection with node %s:%s failed: %m", - walkeeper[i].host, walkeeper[i].port); - - /* Close old connection */ - closesocket(walkeeper[i].sock); - walkeeper[i].sock = PGINVALID_SOCKET; - walkeeper[i].state = SS_OFFLINE; - - /* Postgres wait event set API doesn't support deletion of events, so we have to reconstruct set */ - ResetWalProposerEventSet(); + elog(WARNING, "Connection with node %s:%s in %s state failed", + wk->host, wk->port, FormatWalKeeperState(wk->state)); + ShutdownConnection(i, true); } - /* Try to establish new connection */ - walkeeper[i].sock = ConnectSocketAsync(walkeeper[i].host, walkeeper[i].port, &established); - if (walkeeper[i].sock != PGINVALID_SOCKET) + /* Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. */ + if (wk->conninfo[0] == '\0') { - elog(LOG, "%s with node %s:%s", - established ? "Connected" : "Connecting", walkeeper[i].host, walkeeper[i].port); + sprintf((char*) &wk->conninfo, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", + wk->host, wk->port, zenith_timeline_walproposer); + } + wk->conn = walprop_connect_start((char*) &wk->conninfo); - if (established) - { - /* Start handshake: first of all send information about server */ - if (WriteSocket(walkeeper[i].sock, &serverInfo, sizeof serverInfo)) - { - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_READABLE, walkeeper[i].sock, NULL, &walkeeper[i]); - walkeeper[i].state = SS_HANDSHAKE; - walkeeper[i].asyncOffs = 0; - } - else - { - ResetConnection(i); - } - } - else - { - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, walkeeper[i].sock, NULL, &walkeeper[i]); - walkeeper[i].state = SS_CONNECTING; - } + /* "If the result is null, then libpq has been unable to allocate a new PGconn structure" */ + if (!wk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* The connection should always be non-blocking. It's easiest to just set that here. */ + walprop_set_nonblocking(wk->conn, true); + + /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that + * though, we need to check that it didn't immediately fail. */ + if (walprop_status(wk->conn) == WP_CONNECTION_BAD) + { + /* According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, typically + * because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + wk->conninfo, walprop_error_message(wk->conn)); + /* Even though the connection failed, we still need to clean up the object */ + walprop_finish(wk->conn); + wk->conn = NULL; + return; } -} + /* The documentation for PQconnectStart states that we should call PQconnectPoll in a loop until + * it returns PGRES_POLLING_OK or PGRES_POLLING_FAILED. The other two possible returns indicate + * whether we should wait for reading or writing on the socket. For the first iteration of the + * loop, we're expected to wait until the socket becomes writable. 
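 *
 * For illustration, the same steps written as a synchronous loop (the
 * wait_for_socket() helper is hypothetical; this file instead waits through
 * the WaitEventSet machinery and AdvancePollState):
 *
 *     PostgresPollingStatusType st = PGRES_POLLING_WRITING;
 *
 *     while (st != PGRES_POLLING_OK && st != PGRES_POLLING_FAILED)
 *     {
 *         wait_for_socket(PQsocket(conn), st == PGRES_POLLING_READING);
 *         st = PQconnectPoll(conn);
 *     }
 *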
+ * + * The wording of the documentation is a little ambiguous; thankfully there's an example in the + * postgres source itself showing this behavior. + * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + */ + elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); + + wk->state = SS_CONNECTING; + wk->pollState = SPOLL_CONNECT; + wk->sockWaitState = WANTS_SOCK_WRITE; + + sock = walprop_socket(wk->conn); + wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); + return; +} /* * Calculate WAL position acknowledged by quorum @@ -241,7 +351,11 @@ WalProposerMain(Datum main_arg) pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); - /* Load the libpq-specific functions */ + /* Load the libpq-specific functions */ + load_file("libpqwalproposer", false); + if (WalProposerFunctions == NULL) + elog(ERROR, "libpqwalproposer didn't initialize correctly"); + load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); @@ -267,7 +381,9 @@ WalProposerMain(Datum main_arg) walkeeper[n_walkeepers].host = host; walkeeper[n_walkeepers].port = port; walkeeper[n_walkeepers].state = SS_OFFLINE; - walkeeper[n_walkeepers].sock = PGINVALID_SOCKET; + walkeeper[n_walkeepers].conn = NULL; + /* Set conninfo to empty. We'll fill it out once later, in `ResetConnection` as needed */ + walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; n_walkeepers += 1; } @@ -289,7 +405,7 @@ WalProposerMain(Datum main_arg) if (*zenith_timeline_walproposer != '\0' && !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - + if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && @@ -306,7 +422,7 @@ WalProposerMain(Datum main_arg) am_wal_proposer = true; am_walsender = true; InitWalSender(); - ResetWalProposerEventSet(); + InitEventSet(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) @@ -343,48 +459,38 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void SendMessageToNode(int i, WalMessage* msg) { - ssize_t rc; + WalKeeper* wk = &walkeeper[i]; /* If there is no pending message then send new one */ - if (walkeeper[i].currMsg == NULL) + if (wk->currMsg == NULL) { /* Skip already acknowledged messages */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; - walkeeper[i].currMsg = msg; + wk->currMsg = msg; } - else - msg = walkeeper[i].currMsg; - if (msg != NULL) + /* Only try to send the message if it's non-null */ + if (wk->currMsg) { - msg->req.restartLsn = restartLsn; - msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + wk->currMsg->req.restartLsn = restartLsn; + wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - elog(LOG, "sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %d", - msg->size - sizeof(WalKeeperRequest), - LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(restartLsn), - i); + /* Once we've selected and set up our message, actually start sending it. 
*/ + wk->state = SS_SEND_WAL; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + /* Don't ned to update the event set; that's done by AdvancePollState */ - rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); - if (rc < 0) - { - ResetConnection(i); - } - else if ((size_t)rc == msg->size) /* message was completely sent */ - { - walkeeper[i].asyncOffs = 0; - walkeeper[i].state = SS_RECV_FEEDBACK; - } - else - { - /* wait until socket is available for write */ - walkeeper[i].state = SS_SEND_WAL; - walkeeper[i].asyncOffs = rc; - ModifyWaitEvent(waitEvents, walkeeper[i].eventPos, WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE, NULL); - } + AdvancePollState(i, WL_NO_EVENTS); + } + else + { + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_SOCK_READ; + UpdateEventSet(i, true); } } @@ -549,10 +655,10 @@ TimeToReconnect(TimestampTz now) return (long) (till_reconnect / 1000); } +/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ static void ReconnectWalKeepers(void) { - /* Initiate reconnect if timeout is expired */ TimestampTz now = GetCurrentTimestamp(); if (TimeToReconnect(now) == 0) { @@ -645,257 +751,726 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe return true; } -/* Advance the WAL proposer state machine. */ +/* Requests the currently-running WalProposerPoll to advance the state of this walkeeper */ +static void +RequestStateAdvanceNoPoll(int i) +{ + /* We only have to change the value here; it'll be detected in a call to + * AdvancePollForAllRequested when that's made. */ + request_poll_immediate |= (1 << i); +} + +static void +AdvancePollForAllRequested(void) +{ + uint32 poll_set = request_poll_immediate; + + /* + * We have this in a loop because -- in theory -- polling the requested states could produce + * more that are ready to be polled, though this *really* shouldn't occur in practice. + */ + while ((poll_set = request_poll_immediate)) + { + /* "Take responsibility" for the poll set. We don't want any possibility of other calls to + * AdvancePollForAllRequested duplicating an AdvancePollState. */ + request_poll_immediate = 0; + + /* + * Loop through all nonzero bits and call AdvancePollState + * + * FIXME: This can probably be much more efficient, using something like __builtin__clz. + * Maybe it doesn't matter though. + */ + for (int i = 0; i < n_walkeepers; i++) + { + /* If the ith bit is set, that state requested advancement */ + if (poll_set & (1 << i)) + AdvancePollState(i, WL_NO_EVENTS); + } + } +} + +/* + * Advance the WAL proposer state machine, waiting each time for events to occur + */ void WalProposerPoll(void) { while (true) { + WalKeeper* wk; + int rc; + int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); - int rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - WalKeeper* wk = (WalKeeper*) event.user_data; - int i = (int)(wk - walkeeper); + + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper*) event.user_data; + i = (int)(wk - walkeeper); if (rc != 0) { - /* communication with walkeepers */ - if (event.events & WL_SOCKET_READABLE) + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. 
+ */ + if (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + AdvancePollState(i, event.events); + + /* + * It's possible for AdvancePollState to result in extra states + * being ready to immediately advance to the next state (with + * pollState = SPOLL_NONE). We deal with that here. + */ + AdvancePollForAllRequested(); + } + + /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ + ReconnectWalKeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) + { + ResetLatch(MyLatch); + break; + } + } +} + +/* Performs the logic for advancing the state machine of the 'i'th walkeeper, given that a certain + * set of events has occured. */ +static void +AdvancePollState(int i, uint32 events) +{ + WalKeeper* wk = &walkeeper[i]; + + /* Continue polling all the while we don't need to wait. + * + * At the bottom of this function is "while (walkeeper[i].sockWaitState == WANTS_NO_WAIT)" */ + do { + uint32 expected_events = WaitKindAsEvents(wk->sockWaitState); + + /* If we were expecting SOME event but nothing happened, panic. */ + if ((expected_events & events) == 0 && expected_events) + { + elog(FATAL, + "unexpected event for WalKeeper poll. Expected %s, found code %s (see: FormatEvents).", + FormatWKSockWaitKind(wk->sockWaitState), FormatEvents(events)); + } + + /* Now that we've checked the event is ok, we'll actually run the thing we're looking for */ + switch (wk->pollState) + { + /* If the polling corresponds to a "full" operation, we'll skip straight to that - we + * don't actually need to poll here. */ + case SPOLL_NONE: + case SPOLL_RETRY: + /* Equivalent to 'break', but more descriptive. */ + goto ExecuteNextProtocolState; + + /* On idle polling states, we wait for the socket to open for reading. If this happens, + * the connection has closed *normally*, so we're just done. */ + case SPOLL_IDLE: + elog(LOG, "Walkeeper %s:%s closed connection from %s state", + wk->host, wk->port, FormatWalKeeperState(wk->state)); + /* 'true' to remove existing event for this walkeeper */ + ShutdownConnection(i, true); + return; + + /* Call PQconnectPoll to finalize the connection */ + case SPOLL_CONNECT: { - switch (wk->state) + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + pgsocket new_sock = walprop_socket(wk->conn); + + switch (result) { - case SS_HANDSHAKE: - /* Receive walkeeper node state */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info + wk->asyncOffs, - sizeof(wk->info) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info)) - { - /* WalKeeper response completely received */ - - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) - { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. 
%d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); - ResetConnection(i); - } - else - { - wk->state = SS_VOTING; - wk->feedback.flushLsn = restartLsn; - wk->feedback.hs.ts = 0; - - /* Check if we have quorum */ - if (++n_connected >= quorum) - { - if (n_connected == quorum) - StartElection(); - - /* Now send max-node-id to everyone participating in voting and wait their responses */ - for (int j = 0; j < n_walkeepers; j++) - { - if (walkeeper[j].state == SS_VOTING) - { - if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) - { - ResetConnection(j); - } - else - { - walkeeper[j].asyncOffs = 0; - walkeeper[j].state = SS_WAIT_VERDICT; - } - } - } - } - } - } + case WP_CONN_POLLING_OK: + elog(LOG, "Connected with node %s:%s", wk->host, wk->port); + + /* If we're fully connected, we're good! We can move on to the next state */ + wk->state = SS_EXEC_STARTWALPUSH; + + /* Update the socket -- it might have changed */ + HackyRemoveWalProposerEvent(i); + + /* We need to just pick an event to wait on; this will be overriden + * anyways later. */ + wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, new_sock, NULL, wk); + + /* We're done, but some of the other result cases have cleanup left to do */ + goto ExecuteNextProtocolState; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + /* If connecting failed, we don't want to restart the connection because + * that might run us into a loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to ReconnectWalKeepers. */ + ShutdownConnection(i, true); + return; + + case WP_CONN_POLLING_READING: + wk->sockWaitState = WANTS_SOCK_READ; break; - case SS_WAIT_VERDICT: - /* Receive walkeeper response for our candidate */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info.server.nodeId + wk->asyncOffs, - sizeof(wk->info.server.nodeId) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) - { - /* Response completely received */ - - /* If server accept our candidate, then it returns it in response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) - { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); - } - else - { - /* Handshake completed, do we have quorum? 
*/ - wk->state = SS_IDLE; - if (++n_votes == quorum) - { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); - - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < prop.VCL) - { - elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); - /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) - elog(FATAL, "Failed to recover state"); - } - WalProposerStartStreaming(prop.VCL); - /* Should not return here */ - } - else - { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); - } - } - } + case WP_CONN_POLLING_WRITING: + wk->sockWaitState = WANTS_SOCK_WRITE; break; + } - case SS_RECV_FEEDBACK: - /* Read walkeeper response with flushed WAL position */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->feedback + wk->asyncOffs, - sizeof(wk->feedback) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + /* If we got here, we either have to wait for reading or + * writing. The value of walkeeper[i].sockWaitState indicates + * which one of these it is. + * + * We also have to update the socket here, even if the file + * descriptor itself hasn't changed. It's possible for libpq to + * close the socket and then open a new one, reusing the same + * file descriptor. If this happens, epoll will have + * automatically removed the socket, so we'll stop receiving + * events for it unless we re-add the socket. + * + * To update the socket, we the event and add a new one back. + */ + HackyRemoveWalProposerEvent(i); + + wk->eventPos = AddWaitEventToSet(waitEvents, WaitKindAsEvents(wk->sockWaitState), new_sock, NULL, wk); + + /* We still have polling to do, so we can't move on to the next state. */ + return; + } + + case SPOLL_WRITE_PQ_FLUSH: + { + int flush_result; + + /* If the socket is ready for reading, we have to call PQconsumeInput before + * attempting to flush. */ + if (events & WL_SOCKET_READABLE) + { + /* PQconsumeInput returns 1 if ok, 0 if there was an error */ + if (!walprop_consume_input(wk->conn)) + { + elog(WARNING, "Failed to pre-flush read input for node %s:%s in state [%s]: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + } + + /* PQflush returns: + * 0 if uccessful, + * 1 if unable to send everything yet, + * -1 if it failed */ + switch (flush_result = walprop_flush(wk->conn)) + { + case 0: + /* On success, go to the next state. Our current state only indicates the + * state that *started* the writing, so we need to use that to figure out + * what to do next. */ + switch (wk->state) { - WalMessage* next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->asyncOffs = 0; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); - - /* - * Also send the new VCL to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. 
- */ - if (true) - { - XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - WalMessage *vclUpdateMsg; - - if (minQuorumLsn > lastSentVCLLsn) - { - vclUpdateMsg = CreateMessageVCLOnly(); - if (vclUpdateMsg) - BroadcastMessage(vclUpdateMsg); - lastSentVCLLsn = minQuorumLsn; - } - } + case SS_EXEC_STARTWALPUSH: + wk->state = SS_WAIT_EXEC_RESULT; + break; + case SS_HANDSHAKE_SEND: + wk->state = SS_HANDSHAKE_RECV; + break; + case SS_SEND_VOTE: + wk->state = SS_WAIT_VERDICT; + break; + case SS_SEND_WAL: + wk->state = SS_RECV_FEEDBACK; + break; + default: + elog(FATAL, "Unexpected writing state [%s] for node %s:%s", + FormatWalKeeperState(wk->state), wk->host, wk->port); } + + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; break; - case SS_IDLE: - elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); + case 1: + /* Nothing more to do - we'll just have to wait until we can flush again */ + return; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); ResetConnection(i); break; - default: - elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + elog(FATAL, "invalid return %d from PQflush", flush_result); } + break; } - else if (event.events & WL_SOCKET_WRITEABLE) + + case SPOLL_PQ_CONSUME_AND_RETRY: + /* PQconsumeInput returns 1 on success (though maybe nothing was read), and 0 on + * failure. */ + if (walprop_consume_input(wk->conn)) + /* On success, retry the operation */ + goto ExecuteNextProtocolState; + else + { + /* On failure, print the failure and move on */ + elog(WARNING, "Failed to read input for node %s:%s in state %s: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + } + +ExecuteNextProtocolState: + /* If we get here, walkeeper[i].pollState now corresponds to either SPOLL_NONE or + * SPOLL_RETRY. In either case, we should execute the operation described by the high-level + * state. + * + * All of the cases in this switch statement are provided in the order that state + * transitions happen, moving downwards. So `SS_CONNECTING` moves into + * `SS_EXEC_STARTWALPUSH`, `SS_EXEC_STARTWALPUSH` moves into `SS_WAIT_EXEC_RESULT`, etc. + * + * If/when new states are added, they should abide by the same formatting. + * + * More information about the high-level flow between states is available in the comments + * for WalKeeperState. */ + switch (wk->state) + { + /* walkeepers aren't taken out of SS_OFFLINE by polling. */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Connecting is handled by the SPOLL_CONNECT, which then puts us into + * SS_EXEC_STARTWALPUSH. There's no singular state advancement to be made here. */ + case SS_CONNECTING: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is connecting", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Send "START_WAL_PUSH" command to the walkeeper. 
After sending, wait for response with + * SS_WAIT_EXEC_RESULT */ + case SS_EXEC_STARTWALPUSH: { - switch (wk->state) + int flush_result; + + if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* The query has been started (put into buffers), but hasn't been flushed yet. We + * should do that now. If there's more flushing required, keep doing that until it's + * done */ + switch ((flush_result = walprop_flush(wk->conn))) + { + case 0: + /* success -- go to the next state */ + wk->state = SS_WAIT_EXEC_RESULT; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case 1: + /* we'll have to flush again */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s to exec command: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + default: + elog(FATAL, "invalid return %d from PQflush", flush_result); + } + + /* If no waiting is required, we'll get to that shortly */ + UpdateEventSet(i, false); + break; + } + + /* Waiting for the result of the "START_WAL_PUSH" command. If successful, proceed to + * SS_HANDSHAKE_SEND. If needs more, wait until we can read and retry. */ + case SS_WAIT_EXEC_RESULT: + /* Call our wrapper around PQisBusy + PQgetResult to inspect the result */ + switch (walprop_get_query_result(wk->conn)) + { + /* Successful result, move on to starting the handshake */ + case WP_EXEC_SUCCESS_COPYBOTH: + wk->state = SS_HANDSHAKE_SEND; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + + /* We need more calls to PQconsumeInput to completely receive this result */ + case WP_EXEC_NEEDS_INPUT: + wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; + wk->sockWaitState = WANTS_SOCK_READ; + break; + + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + + /* Unexpected result -- funamdentally an error, but we want to produce a custom + * message, rather than a generic "something went wrong" */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", + wk->host, wk->port); + ResetConnection(i); + break; + } + + /* If the wait state is empty, don't remove the event -- we have more work to do */ + UpdateEventSet(i, false); + + break; + + /* Start handshake: first of all send information about server */ + case SS_HANDSHAKE_SEND: + /* Note: This state corresponds to the process of sending the relevant information + * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the + * handshake. */ + switch (walprop_async_write(wk->conn, &serverInfo, sizeof(serverInfo))) { - case SS_CONNECTING: + case PG_ASYNC_WRITE_SUCCESS: + /* If the write immediately succeeds, we can move on to the next state. 
*/ + wk->state = SS_HANDSHAKE_RECV; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + + case PG_ASYNC_WRITE_WOULDBLOCK: + /* Wait until the socket is write-ready and try again */ + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We need to call PQflush some number of additional times, with different + * actions depending on whether the socket is readable or writable */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + + case PG_ASYNC_WRITE_FAIL: + /* On failure, print the error and reset the connection */ + elog(WARNING, "Handshake with node %s:%s failed to start: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* Update the event set for this walkeeper, depending on what it's been changed to + * + * We set remove_if_nothing = false because we'll immediately execute + * SS_HANDSHAKE_RECV on the next iteration of the outer loop. */ + UpdateEventSet(i, false); + break; + + /* Finish handshake comms: receive information about the walkeeper */ + case SS_HANDSHAKE_RECV: + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. */ + if (!ReadPGAsyncIntoValue(i, &wk->info, sizeof(wk->info))) + return; + + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + { + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + return; + } + + /* Protocol is all good, move to voting */ + wk->state = SS_VOTING; + wk->pollState = SPOLL_IDLE; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; + + /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. + * We'll eventually get a task when the election starts. + * + * If we do have quorum, we can start an election */ + if (++n_connected >= quorum) + { + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) { - int optval = 0; - ACCEPT_TYPE_ARG3 optlen = sizeof(optval); - if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + /* Remember: SS_VOTING indicates that the walkeeper is participating in + * voting, but hasn't sent anything yet. The ones that have sent something + * are given SS_SEND_VOTE or SS_WAIT_VERDICT. */ + if (walkeeper[j].state == SS_VOTING) { - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, - strerror(optval)); - closesocket(wk->sock); - wk->sock = PGINVALID_SOCKET; - wk->state = SS_OFFLINE; - ResetWalProposerEventSet(); - } - else - { - uint32 len = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - /* - * Start handshake: send information about server. - * First of all send 0 as package size: it allows walkeeper to distinguish - * wal_proposer's connection from standard replication connection from pagers. 
- */ - if (WriteSocket(wk->sock, &len, sizeof len) - && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) - { - wk->state = SS_HANDSHAKE; - wk->asyncOffs = 0; - } - else - { - ResetConnection(i); - } + walkeeper[j].state = SS_SEND_VOTE; + walkeeper[j].pollState = SPOLL_NONE; + walkeeper[j].sockWaitState = WANTS_NO_WAIT; + + /* If this isn't the current walkeeper, defer handling this state until + * later. We'll mark it for individual work in WalProposerPoll. */ + if (j != i) + RequestStateAdvanceNoPoll(j); } + } + } + break; + + /* Voting is an idle state - we don't expect any events to trigger. Refer to the + * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to + * SS_SEND_VOTE. */ + case SS_VOTING: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* We have quorum for voting, send our vote request */ + case SS_SEND_VOTE: + switch (walprop_async_write(wk->conn, &prop, sizeof(prop))) + { + case PG_ASYNC_WRITE_SUCCESS: + /* If the write immediately succeeds, we can move on to the next state. */ + wk->state = SS_WAIT_VERDICT; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case PG_ASYNC_WRITE_WOULDBLOCK: + /* Wait until the socket is write-ready and try again */ + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; break; + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We need to call PQflush some number of additional times, with different + * actions depending on whether the socket is readable or writable */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case PG_ASYNC_WRITE_FAIL: + /* Report the failure and reset the connection; there isn't much + * more we can do. */ + elog(WARNING, "Failed to send vote request to node %s:%s: %s", + wk->host, wk->port, + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* Don't remove from the event set if there's nothing we're waiting for; we'll get + * it on the next iteration of the loop */ + UpdateEventSet(i, false); + break; + + /* Start reading the walkeeper response for our candidate */ + case SS_WAIT_VERDICT: + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. */ + if (!ReadPGAsyncIntoValue(i, &wk->info.server.nodeId, sizeof(wk->info.server.nodeId))) + return; + + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); + } + + /* Handshake completed, do we have quorum? 
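 *
 * (If this vote is the one that completes the quorum, we optionally run
 * recovery to bring the safekeepers up to date and then hand control to
 * WalProposerStartStreaming, which should not return. A vote that arrives
 * after streaming has already begun instead gets the pending message queue
 * replayed to this walkeeper via SendMessageToNode below.)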
*/ + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_NO_WAIT; + + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn < prop.VCL) + { + elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } - case SS_SEND_WAL: - rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == wk->currMsg->size) - { - /* WAL block completely sent */ - wk->state = SS_RECV_FEEDBACK; - wk->asyncOffs = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - } + break; + + /* Start to send the message at wk->currMsg. Triggered only by calls to + * SendMessageToNode */ + case SS_SEND_WAL: + { + WalMessage* msg = wk->currMsg; + + /* Don't repeat logs if we have to retry the actual send operation itself */ + if (wk->pollState != SPOLL_RETRY) + { + elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", + msg->size - sizeof(WalKeeperRequest), + LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(restartLsn), + wk->host, wk->port); + } + + switch (walprop_async_write(wk->conn, &msg->req, msg->size)) + { + case PG_ASYNC_WRITE_SUCCESS: + wk->state = SS_RECV_FEEDBACK; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case PG_ASYNC_WRITE_WOULDBLOCK: + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; break; + case PG_ASYNC_WRITE_TRY_FLUSH: + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send WAL to node %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + } - default: - elog(FATAL, "Unexpected write state %d", wk->state); + /* Don't remove if if sockWaitState == WANTS_NO_WAIT, because we'll immediately move + * on to SS_RECV_FEEDBACK if that's the case. */ + UpdateEventSet(i, false); + break; + } + + /* Start to receive the feedback from a message sent via SS_SEND_WAL */ + case SS_RECV_FEEDBACK: + { + WalMessage* next; + XLogRecPtr minQuorumLsn; + WalMessage* vclUpdateMsg; + + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. 
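 *
 * (Concretely: when ReadPGAsyncIntoValue, defined below, returns false it
 * has already either put this walkeeper into SPOLL_PQ_CONSUME_AND_RETRY
 * waiting for WANTS_SOCK_READ, or logged the failure and reset the
 * connection, so simply returning here is safe.)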
*/ + if (!ReadPGAsyncIntoValue(i, &wk->feedback, sizeof(wk->feedback))) + return; + + next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_NO_WAIT; + /* Don't update the event set; that's handled by SendMessageToNode if necessary */ + + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; } + break; } + + /* Truly an idle state - there isn't any typ of advancement expected here. */ + case SS_IDLE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); + break; /* actually unreachable; makes the compiler happier */ } - ReconnectWalKeepers(); - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); + /* On subsequent iterations of the loop, there's no additonal events to process */ + events = WL_NO_EVENTS; + } while (walkeeper[i].sockWaitState == WANTS_NO_WAIT && walkeeper[i].pollState != SPOLL_IDLE); +} + +/* + * Reads a CopyData block into a value, returning whether the read was successful + * + * If the read was not immediately successful (either polling is required, or it actually failed), + * then the state is set appropriately on the walkeeper. + */ +bool +ReadPGAsyncIntoValue(int i, void* value, size_t value_size) +{ + WalKeeper* wk = &walkeeper[i]; + char *buf = NULL; + int buf_size = -1; + + switch (walprop_async_read(wk->conn, &buf, &buf_size)) + { + /* On success, there's just a couple more things we'll check below */ + case PG_ASYNC_READ_SUCCESS: break; - } + + case PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN: + wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; + + if (wk->sockWaitState != WANTS_SOCK_READ) + { + wk->sockWaitState = WANTS_SOCK_READ; + UpdateEventSet(i, true); + } + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", + wk->host, wk->port, + FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; } -} + /* + * If we get here, the read was ok, but we still need to check it was the right amount + */ + if (buf_size != value_size) + { + elog(FATAL, + "Unexpected walkeeper %s:%s read length from %s state. 
Expected %ld, found %d", + wk->host, wk->port, + FormatWalKeeperState(wk->state), + sizeof(wk->info.server.nodeId), buf_size); + } + + /* Copy the resulting info into place */ + memcpy(value, buf, buf_size); + return true; +} /* * WalProposerRegister diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index cea41ef01cc..722fa66d5e6 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -1,6 +1,7 @@ #include "replication/walproposer.h" #include "common/logging.h" #include "common/ip.h" +#include "../interfaces/libpq/libpq-fe.h" #include #include @@ -28,170 +29,166 @@ CompareLsn(const void *a, const void *b) return 1; } -static bool -SetSocketOptions(pgsocket sock) +/* Converts a `WKSockWaitKind` into the bit flags that would match it + * + * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero, + * which does not match any events. Attempting to wait on no events will + * always timeout, so it's best to double-check the value being provided to + * this function where necessary. */ +uint32 +WaitKindAsEvents(WKSockWaitKind wait_kind) { - int on = 1; - if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *) &on, sizeof(on)) < 0) - { - elog(WARNING, "setsockopt(TCP_NODELAY) failed: %m"); - closesocket(sock); - return false; - } - if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *) &on, sizeof(on)) < 0) - { - elog(WARNING, "setsockopt(SO_REUSEADDR) failed: %m"); - closesocket(sock); - return false; - } - if (!pg_set_noblock(sock)) - { - elog(WARNING, "faied to switch socket to non-blocking mode: %m"); - closesocket(sock); - return false; - } - return true; -} + uint32 return_val; -pgsocket -ConnectSocketAsync(char const* host, char const* port, bool* established) -{ - struct addrinfo *addrs = NULL, - *addr, - hints; - int ret; - pgsocket sock = PGINVALID_SOCKET; - - hints.ai_flags = AI_PASSIVE; - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - hints.ai_addrlen = 0; - hints.ai_addr = NULL; - hints.ai_canonname = NULL; - hints.ai_next = NULL; - ret = pg_getaddrinfo_all(host, port, &hints, &addrs); - if (ret || !addrs) - { - elog(WARNING, "Could not resolve \"%s\": %s", - host, gai_strerror(ret)); - return -1; - } - for (addr = addrs; addr; addr = addr->ai_next) + switch (wait_kind) { - sock = socket(addr->ai_family, SOCK_STREAM, 0); - if (sock == PGINVALID_SOCKET) - { - elog(WARNING, "could not create socket: %m"); - continue; - } - if (!SetSocketOptions(sock)) - continue; - - /* - * Bind it to a kernel assigned port on localhost and get the assigned - * port via getsockname(). - */ - while ((ret = connect(sock, addr->ai_addr, addr->ai_addrlen)) < 0 && errno == EINTR); - if (ret < 0) - { - if (errno == EINPROGRESS) - { - *established = false; - break; - } - elog(WARNING, "Could not establish connection to %s:%s: %m", - host, port); - closesocket(sock); - } - else - { - *established = true; - break; - } + case WANTS_NO_WAIT: + return_val = WL_NO_EVENTS; + break; + case WANTS_SOCK_READ: + return_val = WL_SOCKET_READABLE; + break; + case WANTS_SOCK_WRITE: + return_val = WL_SOCKET_WRITEABLE; + break; + case WANTS_SOCK_EITHER: + return_val = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; } - return sock; + + return return_val; } -ssize_t -ReadSocketAsync(pgsocket sock, void* buf, size_t size) + +/* Returns a human-readable string corresonding to the WalKeeperState + * + * The string should not be freed. 
+ * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatWalKeeperState(wk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatWalKeeperState(wk->state)); + */ +char* +FormatWalKeeperState(WalKeeperState state) { - size_t offs = 0; + char* return_val; - while (size != offs) + switch (state) { - ssize_t rc = recv(sock, (char*)buf + offs, size - offs, 0); - if (rc < 0) - { - if (errno == EINTR) - continue; - if (errno == EAGAIN || errno == EWOULDBLOCK) - return offs; - elog(WARNING, "Socket write failed: %m"); - return -1; - } - else if (rc == 0) - { - elog(WARNING, "Connection was closed by peer"); - return -1; - } - offs += rc; + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING: + return_val = "connecting"; + break; + case SS_EXEC_STARTWALPUSH: + return_val = "sending 'START_WAL_PUSH' query"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_SEND: + return_val = "handshake (sending)"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_SEND_VOTE: + return_val = "sending vote"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_SEND_WAL: + return_val = "WAL-sending"; + break; + case SS_RECV_FEEDBACK: + return_val = "WAL-feedback-receiving"; + break; } - return offs; + + return return_val; } -ssize_t -WriteSocketAsync(pgsocket sock, void const* buf, size_t size) +/* Returns a human-readable string corresponding to the WKSockWaitKind + * + * The string should not be freed. */ +char* +FormatWKSockWaitKind(WKSockWaitKind wait_kind) { - size_t offs = 0; + char* return_val; - while (size != offs) + switch (wait_kind) { - ssize_t rc = send(sock, (char const*)buf + offs, size - offs, 0); - if (rc < 0) - { - if (errno == EINTR) - continue; - if (errno == EAGAIN || errno == EWOULDBLOCK) - return offs; - elog(WARNING, "Socket write failed: %m"); - return -1; - } - else if (rc == 0) - { - elog(WARNING, "Connection was closed by peer"); - return -1; - } - offs += rc; + case WANTS_NO_WAIT: + return_val = ""; + break; + case WANTS_SOCK_READ: + return_val = ""; + break; + case WANTS_SOCK_WRITE: + return_val = ""; + break; + case WANTS_SOCK_EITHER: + return_val = ""; + break; } - return offs; + + return return_val; } -bool -WriteSocket(pgsocket sock, void const* buf, size_t size) +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. */ +char* +FormatEvents(uint32 events) { - char* src = (char*)buf; - - while (size != 0) + static char return_str[8]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an + * sense of what events have been triggered without needing to remember your powers of two. */ + + return_str[0] = (events & WL_LATCH_SET ) ? 
'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; + + if (events & (~all_flags)) { - ssize_t rc = send(sock, src, size, 0); - if (rc < 0) - { - if (errno == EINTR) - continue; - elog(WARNING, "Socket write failed: %m"); - return false; - } - else if (rc == 0) - { - elog(WARNING, "Connection was closed by peer"); - return false; - } - size -= rc; - src += rc; + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[6] = '*'; + return_str[7] = '\0'; } - return true; + else + return_str[6] = '\0'; + + return (char *) &return_str; } /* diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index d770473ad35..b7b35e876e5 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -2,10 +2,12 @@ #define __WALKEEPER_H__ #include "postgres.h" +#include "port.h" #include "access/xlog_internal.h" #include "access/transam.h" #include "nodes/replnodes.h" #include "utils/uuid.h" +#include "replication/walreceiver.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 1 @@ -15,29 +17,240 @@ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#ifndef WL_NO_EVENTS +#define WL_NO_EVENTS 0 +#else +#error "WL_NO_EVENTS already defined" +#endif + extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern bool am_wal_proposer; +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + struct WalMessage; typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; extern char *zenith_tenant_walproposer; -/* WAL safekeeper state */ +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then + * call PQconsumeInput and try again. */ + PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* There wasn't space in the buffers to queue the data; wait until the + * socket is write-ready and try again. */ + PG_ASYNC_WRITE_WOULDBLOCK, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. 
Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* WAL safekeeper state - high level */ typedef enum { + /* + * Does not have an active connection and will stay that way until + * further notice. May be paired with: + * - SPOLL_NONE + * + * Moves to SS_CONNECTING only by calls to ResetConnection. + */ SS_OFFLINE, + /* + * Currently in the process of connecting. May be paired with: + * - SPOLL_CONNECT + * + * After the connection is made, moves to SS_EXEC_STARTWALPUSH. + */ SS_CONNECTING, - SS_HANDSHAKE, + /* + * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. May be paired with: + * - SPOLL_NONE + * - SPOLL_WRITE_PQ_FLUSH + * + * After the query sends, moves to SS_WAIT_EXEC_RESULT. + */ + SS_EXEC_STARTWALPUSH, + /* + * Waiting for the result of the "START_WAL_PUSH" command. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY + * + * We only pair with PQconsumeInput because we *need* to wait until the socket is open for + * reading to try again. + * + * After we get a successful result, moves to SS_HANDSHAKE_SEND. + */ + SS_WAIT_EXEC_RESULT, + /* + * Executing the sending half of the handshake. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH if it hasn't finished sending, + * - SPOLL_RETRY if buffers are full and we just need to try again, + * - SPOLL_NONE + * + * After sending, moves to SS_HANDSHAKE_RECV. + */ + SS_HANDSHAKE_SEND, + /* + * Executing the receiving half of the handshake. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY if we need more input + * - SPOLL_NONE + * + * After receiving, moves to SS_VOTING. + */ + SS_HANDSHAKE_RECV, + /* + * Currently participating in voting, but a quorum hasn't yet been reached. Idle state. May be + * paired with: + * - SPOLL_IDLE + * + * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of SS_HANDSHAKE_RECV. + */ SS_VOTING, + /* + * Currently sending the assigned vote + */ + SS_SEND_VOTE, + /* + * Sent voting information, waiting to receive confirmation from the node. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH + * + * After receiving, moves to SS_IDLE. + */ SS_WAIT_VERDICT, + /* + * Waiting for quorum to send WAL. Idle state. May be paired with: + * - SPOLL_IDLE + * + * Moves to SS_SEND_WAL only by calls to SendMessageToNode. + */ SS_IDLE, + /* + * Currently sending the message at currMsg. This state is only ever reached through calls to + * SendMessageToNode. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH + * - SPOLL_NONE + * + * After sending, moves to SS_RECV_FEEDBACK. + */ SS_SEND_WAL, - SS_RECV_FEEDBACK + /* + * Currently reading feedback from sending the WAL. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY + * - SPOLL_NONE + * + * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to + * SendMessageToNode. + */ + SS_RECV_FEEDBACK, } WalKeeperState; +/* WAL safekeeper state - individual level + * + * This type encompasses the type of polling necessary to move on to the + * next `WalKeeperState` from the current. It's things like "we need to + * call PQflush some more", or "retry the current operation". + */ +typedef enum +{ + /* + * The current state is the one we want to be in; we just haven't run + * the code for it. It should be processed with AdvancePollState to + * start to advance to the next state. + * + * Expected WKSockWaitKind: WANTS_NO_WAIT. + * + * Note! This polling state is different from the others: its attached + * WalKeeperState is what *will* be executed, not what just was. 
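 *
 * For example, a walkeeper that SS_HANDSHAKE_RECV handling (possibly run
 * for a different walkeeper) has just put into (state = SS_SEND_VOTE,
 * pollState = SPOLL_NONE, sockWaitState = WANTS_NO_WAIT) has not sent its
 * vote yet; the next call to AdvancePollState on it executes the
 * SS_SEND_VOTE case.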
+ */ + SPOLL_NONE, + /* + * We need to retry the operation once the socket permits it + * + * Expected WKSockWaitKind: Any of WANTS_SOCK_READ, WANTS_SOCK_WRITE, + * WANTS_SOCK_EITHER -- operation dependent. + */ + SPOLL_RETRY, + /* + * Marker for states that do not expect to be advanced by calls to AdvancePollState. Not to be + * confused with SS_IDLE, which carries a different (but related) meaning. + * + * For this polling state, we interpret any read-readiness on the socket as an indication that + * the connection has closed normally. + * + * Expected WKSockWaitKind: WANTS_SOCK_READ + */ + SPOLL_IDLE, + /* + * We need to repeat calls to PQconnectPoll. This is only available for + * SS_CONNECTING + * + * Expected WKSockWaitKind: WANTS_SOCK_READ or WANTS_SOCK_WRITE + */ + SPOLL_CONNECT, + /* Poll with PQflush, finishing up a call to WritePGAsync. Always + * combined with writing states, like SS_HANDSHAKE_SEND or SS_SEND_WAL. + * + * Expected WKSockWaitKind: WANTS_SOCK_EITHER + */ + SPOLL_WRITE_PQ_FLUSH, + /* + * Get input with PQconsumeInput and try the operation again. This is + * always combined with reading states -- like SS_HANDSHAKE_RECV or + * SS_WAIT_VERDICT, and the operation repetition helps to reduce the + * amount of repeated logic. + * + * Expected WKSockWaitKind: WANTS_SOCK_READ + */ + SPOLL_PQ_CONSUME_AND_RETRY, +} WalKeeperPollState; + +/* The state of the socket that we're waiting on. This is used to + * double-check for polling that the socket we're being handed is correct. + * + * Used in the sockWaitState field of WalKeeper, in combination with the + * WalKeeperPollState. + * + * Each polling state above lists the set of values that they accept. */ +typedef enum +{ + /* No waiting is required for the poll state */ + WANTS_NO_WAIT, + /* Polling should resume only once the socket is ready for reading */ + WANTS_SOCK_READ, + /* Polling should resume only once the socket is ready for writing */ + WANTS_SOCK_WRITE, + /* Polling should resume once the socket is ready for reading or + * writing */ + WANTS_SOCK_EITHER, +} WKSockWaitKind; + /* * Unique node identifier used by Paxos */ @@ -58,7 +271,7 @@ typedef struct ServerInfo uint64 systemId; /* Postgres system identifier */ uint8 ztimelineid[16]; /* Zenith timeline id */ XLogRecPtr walEnd; - TimeLineID timeline; + TimeLineID timeline; int walSegSize; uint8 ztenantid[16]; } ServerInfo; @@ -120,6 +333,12 @@ struct WalMessage uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ WalKeeperRequest req; /* request to walkeeper (message header) */ + + /* PHANTOM FIELD: + * + * All WalMessages are allocated with exactly (size - sizeof(WalKeeperRequest)) additional bytes + * after them, containing the body of the message. This allocation is done in `CreateMessage` + * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). 
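 *
 * As an illustration only (see CreateMessage for the actual code), the
 * layout corresponds to something like:
 *
 *     WalMessage *msg = (WalMessage *) malloc(sizeof(WalMessage) + bodyLen);
 *     msg->size = sizeof(WalKeeperRequest) + bodyLen;
 *     memcpy(&msg->req + 1, body, bodyLen);   // body sits right after req
 *
 * which is why sending &msg->req for msg->size bytes transmits both the
 * header and the body in one go.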
*/ }; /* @@ -138,24 +357,29 @@ typedef struct WalKeeperResponse */ typedef struct WalKeeper { - char const* host; - char const* port; - pgsocket sock; /* socket descriptor */ - WalMessage* currMsg; /* message been send to the receiver */ - int asyncOffs;/* offset for asynchronus read/write operations */ - int eventPos; /* position in wait event set */ - WalKeeperState state;/* walkeeper state machine state */ - WalKeeperInfo info; /* walkeeper info */ - WalKeeperResponse feedback; /* feedback to master */ + char const* host; + char const* port; + char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + WalProposerConn* conn; /* postgres protocol connection to the walreceiver */ + + WalMessage* currMsg; /* message been send to the receiver */ + + int eventPos; /* position in wait event set. Equal to -1 if no event */ + WalKeeperState state; /* walkeeper state machine state */ + WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ + WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for + the polling required? */ + WalKeeperInfo info; /* walkeeper info */ + WalKeeperResponse feedback; /* feedback to master */ } WalKeeper; int CompareNodeId(NodeId* id1, NodeId* id2); -pgsocket ConnectSocketAsync(char const* host, char const* port, bool* established); -bool WriteSocket(pgsocket sock, void const* buf, size_t size); -ssize_t ReadSocketAsync(pgsocket sock, void* buf, size_t size); -ssize_t WriteSocketAsync(pgsocket sock, void const* buf, size_t size); int CompareLsn(const void *a, const void *b); +uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); +char* FormatWalKeeperState(WalKeeperState state); +char* FormatWKSockWaitKind(WKSockWaitKind wait_kind); +char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); bool HexDecodeString(uint8 *result, char *input, int nbytes); @@ -173,4 +397,168 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, uint32 feedbackCatalogEpoch); void StartReplication(StartReplicationCmd *cmd); +/* libpqwalproposer hooks & helper type */ + +/* Re-exported PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + /* Any success result other than a single CopyBoth was received. The specifics of the result + * were already logged, but it may be useful to provide an error message indicating which + * walkeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. */ + WP_EXEC_UNEXPECTED_SUCCESS, + /* No result available at this time. Wait until read-ready, call PQconsumeInput, then try again. + * Internally, this is returned when PQisBusy indicates that PQgetResult would block. */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that + * they not be relied upon (except for displaying to the user). 
We + * don't need that extra functionality, so we collect them into a + * single tag here. + */ + WP_CONNECTION_IN_PROGRESS, +} WalProposerConnStatusType; + +/* Re-exported PQerrorMessage */ +typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); + +/* Re-exported PQstatus */ +typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); + +/* Re-exported PQconnectStart */ +typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); + +/* Re-exported PQconectPoll */ +typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); + +/* Re-exported PQsendQuery */ +typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); + +/* Wrapper around PQisBusy + PQgetResult */ +typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); + +/* Re-exported PQsetnonblocking */ +typedef int (*walprop_set_nonblocking_fn) (WalProposerConn* conn, int arg); + +/* Re-exported PQsocket */ +typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); + +/* Re-exported PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn); + +/* Re-exported PQconsumeInput */ +typedef int (*walprop_consume_input_fn) (WalProposerConn* conn); + +/* Re-exported PQfinish */ +typedef void (*walprop_finish_fn) (WalProposerConn* conn); + +/* + * Ergonomic wrapper around PGgetCopyData + * + * Reads a CopyData block from a walkeeper, setting *amount to the number + * of bytes returned. + * + * This function is allowed to assume certain properties specific to the + * protocol with the walkeepers, so it should not be used as-is for any + * other purpose. + * + * Note: If possible, using is generally preferred, + * because it performs a bit of extra checking work that's always required + * and is normally somewhat verbose. + */ +typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, + char** buf, + int* amount); + +/* + * Ergonomic wrapper around PQputCopyData + PQflush + * + * Starts to write a CopyData block to a walkeeper. + * + * For information on the meaning of return codes, refer to PGAsyncWriteResult. + */ +typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, + void const* buf, + size_t size); + +/* All libpqwalproposer exported functions collected together. 
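 *
 * The implementing library is expected to define one instance of the struct
 * below and point WalProposerFunctions at it when the library is loaded. A
 * minimal sketch of that registration (the static variable and the pq_*
 * function names are hypothetical; _PG_init is assumed to be the load-time
 * entry point):
 *
 *     static WalProposerFunctionsType PQWalProposerAPI = {
 *         .walprop_error_message = pq_error_message,
 *         .walprop_status        = pq_status,
 *         // ... the remaining members are filled in the same way ...
 *         .walprop_async_write   = pq_async_write,
 *     };
 *
 *     void
 *     _PG_init(void)
 *     {
 *         if (WalProposerFunctions != NULL)
 *             elog(ERROR, "libpqwalproposer already loaded");
 *         WalProposerFunctions = &PQWalProposerAPI;
 *     }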
*/ +typedef struct WalProposerFunctionsType +{ + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_set_nonblocking_fn walprop_set_nonblocking; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_consume_input_fn walprop_consume_input; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; +} WalProposerFunctionsType; + +/* Allow the above functions to be "called" with normal syntax */ +#define walprop_error_message(conn) \ + WalProposerFunctions->walprop_error_message(conn) +#define walprop_status(conn) \ + WalProposerFunctions->walprop_status(conn) +#define walprop_connect_start(conninfo) \ + WalProposerFunctions->walprop_connect_start(conninfo) +#define walprop_connect_poll(conn) \ + WalProposerFunctions->walprop_connect_poll(conn) +#define walprop_send_query(conn, query) \ + WalProposerFunctions->walprop_send_query(conn, query) +#define walprop_get_query_result(conn) \ + WalProposerFunctions->walprop_get_query_result(conn) +#define walprop_set_nonblocking(conn, arg) \ + WalProposerFunctions->walprop_set_nonblocking(conn, arg) +#define walprop_socket(conn) \ + WalProposerFunctions->walprop_socket(conn) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) +#define walprop_consume_input(conn) \ + WalProposerFunctions->walprop_consume_input(conn) +#define walprop_finish(conn) \ + WalProposerFunctions->walprop_finish(conn) +#define walprop_async_read(conn, buf, amount) \ + WalProposerFunctions->walprop_async_read(conn, buf, amount) +#define walprop_async_write(conn, buf, size) \ + WalProposerFunctions->walprop_async_write(conn, buf, size) + +/* + * The runtime location of the libpqwalproposer functions. + * + * This pointer is set by the initializer in libpqwalproposer, so that we + * can use it later. + */ +extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; + #endif From 24bb700626cb63a61668e4aaa6de6a36b8333318 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 16 Aug 2021 13:19:42 +0300 Subject: [PATCH 034/214] Build zenithdb/compute-node:latest in CI (zenithdb/console#125) --- .circleci/config.yml | 32 +++++++++++++++++++ .dockerignore | 5 +++ Dockerfile | 74 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000..ad48e5ac396 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,32 @@ +version: 2.1 + +jobs: + + # Build zenithdb/compute-node:latest image and push it to Docker hub + docker_image: + docker: + - image: cimg/base:2021.04 + working_directory: ~/repo + steps: + - checkout: + path: ~/repo + - setup_remote_docker: + docker_layer_caching: true + - run: + name: Build and push Docker image + command: | + echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + docker build -t zenithdb/compute-node:latest . && docker push zenithdb/compute-node:latest + +workflows: + version: 2 + compute_node: + jobs: + # Build and push image only for commits to `main`. 
+ - docker_image: + # Context gives an ability to login + context: 'Docker Hub' + filters: + branches: + only: + - main diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..530192a3b20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.git +.vscode +.circleci +tmp_install +compute_build diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..83407413142 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,74 @@ +# +# Image with pre-built tools +# +FROM zenithdb/compute-tools:latest AS compute-deps +# Only to get ready apply_conf binary as a dep + +# +# Image with Postgres build deps +# +FROM debian:buster-slim AS build-deps + +RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev + +# +# Image with built Postgres +# +FROM build-deps AS pg-build + +# Add user postgres +RUN adduser postgres +RUN mkdir /pg && chown postgres:postgres /pg + +# Copy source files +COPY . /pg/ + +# Build and install Postgres locally +RUN mkdir /pg/compute_build && cd /pg/compute_build && \ + ../configure CFLAGS='-O0 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --enable-cassert --enable-depend && \ + # Install main binaries and contribs + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/zenith install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + +USER postgres +WORKDIR /pg + +# +# Final compute node image to be exported +# +FROM debian:buster-slim + +# libreadline-dev is required to run psql +RUN apt-get update && apt-get -yq install openssh-server libreadline-dev && \ + # This will prepare everything needed by sshd + # like generation host keys with ssh-keygen -A + service ssh start + +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres/compute && \ + chown -R postgres:postgres /var/db/postgres/specs && \ + chmod 0750 /var/db/postgres/compute + +# Copy ready Postgres binaries +COPY --from=pg-build /pg/compute_build/postgres_bin /var/db/postgres/install + +# Copy apply_conf binary +COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf + +# Copy postgres binaries to the common location +RUN cp /var/db/postgres/install/bin/* /usr/local/bin/ && \ + cp -r /var/db/postgres/install/share/* /usr/local/share/ && \ + # Add postgres shared objects to the search path + echo '/var/db/postgres/install/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# To be able to run sshd (seems to be default) +# USER root + +ENTRYPOINT ["/bin/sh"] From 5c17aba6a5580ea4767fca0c26d9693140ee8f67 Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 23 Aug 2021 10:31:38 +0300 Subject: [PATCH 035/214] zenith_regression_tests.patch Add alternative output for tablespace test, because tablespaces are not supported in zenith yet --- src/test/regress/output/tablespace_1.source | 941 ++++++++++++++++++++ 1 file changed, 941 insertions(+) create mode 100644 src/test/regress/output/tablespace_1.source diff --git a/src/test/regress/output/tablespace_1.source b/src/test/regress/output/tablespace_1.source new file mode 100644 index 00000000000..1c3b75cb6d1 --- /dev/null +++ 
b/src/test/regress/output/tablespace_1.source @@ -0,0 +1,941 @@ +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- create a tablespace we can use +CREATE TABLESPACE regress_tblspace LOCATION '@testtablespace@'; +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; 
+ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. +SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. +ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. +CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. 
+CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. +SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asexecute'; + relname | spcname +-----------+------------------ + 
asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | 
regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) + +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. +SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace 
"regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 6 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; From 196856598f72d4d44b843d9e33cbd56c1b4f9b43 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Aug 2021 12:37:10 +0300 Subject: [PATCH 036/214] Add test function to flush the shared buffer cache. --- .../zenith_test_utils--1.0.sql | 6 ++ contrib/zenith_test_utils/zenithtest.c | 69 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index 6c8fe6521cf..dbf18288fd4 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -6,3 +6,9 @@ RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index a7eb278a09b..2d42110cf36 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -12,11 +12,14 @@ #include "fmgr.h" #include "access/xact.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -48,3 +51,69 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. + */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_zenith_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. + */ + save_zenith_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from + * the buffer cache if no one else is holding a pin on it. 
+ */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_zenith_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} From e4f2b1689df266c78642c368a95fdf1810039fca Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 17 Aug 2021 21:16:08 +0300 Subject: [PATCH 037/214] Basic safekeeper refactoring and bug fixing. On the walproposer side, - Change the voting flow so that acceptor tells his epoch along with giving the vote, not before it; otherwise it might get immediately stale. #294 - Adjust to using separate structs for disk and network. ref #315 --- .../libpqwalproposer/libpqwalproposer.c | 2 + src/backend/replication/walproposer.c | 222 ++++++++++-------- src/backend/replication/walproposer_utils.c | 15 +- src/include/replication/walproposer.h | 133 ++++++----- 4 files changed, 199 insertions(+), 173 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index a5d7fec1a33..63c90f5a54b 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -1,3 +1,5 @@ +#include "postgres.h" + #include "replication/walproposer.h" #include "libpq-fe.h" diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 102ce033949..0137cc67b8e 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -4,8 +4,11 @@ * * Broadcast WAL stream to Zenith WAL acceptetors */ +#include "postgres.h" + #include #include +#include "access/xlogdefs.h" #include "replication/walproposer.h" #include "storage/latch.h" #include "miscadmin.h" @@ -39,12 +42,15 @@ static WalMessage* msgQueueHead; static WalMessage* msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ -static ServerInfo serverInfo; +static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; -static WalKeeperResponse lastFeedback; +static AppendResponse lastFeedback; static XLogRecPtr restartLsn; /* Last position received by all walkeepers. */ -static RequestVote prop; /* Vote request for walkeeper */ -static int leader; /* Most advanced walkeeper */ +static VoteRequest voteRequest; /* Vote request for walkeeper */ +static term_t propTerm; /* term of the proposer */ +static XLogRecPtr propVcl; /* VCL of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; @@ -187,6 +193,7 @@ ShutdownConnection(int i, bool remove_event) walkeeper[i].state = SS_OFFLINE; walkeeper[i].pollState = SPOLL_NONE; walkeeper[i].sockWaitState = WANTS_NO_WAIT; + walkeeper[i].currMsg = NULL; if (remove_event) HackyRemoveWalProposerEvent(i); @@ -281,8 +288,14 @@ GetAcknowledgedByQuorumWALPosition(void) */ for (int i = 0; i < n_walkeepers; i++) { - responses[i] = walkeeper[i].feedback.epoch == prop.epoch - ? walkeeper[i].feedback.flushLsn : prop.VCL; + /* + * Note that while we haven't pushed WAL up to VCL to the majority we + * don't really know which LSN is reliably committed as reported + * flush_lsn is physical end of wal, which can contain diverged + * history (compared to donor). 
+ */ + responses[i] = walkeeper[i].feedback.epoch == propTerm + ? walkeeper[i].feedback.flushLsn : 0; } qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); @@ -302,6 +315,7 @@ HandleWalKeeperResponse(void) if (minQuorumLsn > lastFeedback.flushLsn) { lastFeedback.flushLsn = minQuorumLsn; + /* advance the replication slot */ ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); @@ -326,7 +340,7 @@ HandleWalKeeperResponse(void) Assert(restartLsn < msg->req.endLsn); restartLsn = msg->req.endLsn; } - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } if (!msgQueueHead) /* queue is empty */ @@ -395,26 +409,24 @@ WalProposerMain(Datum main_arg) GetXLogReplayRecPtr(&ThisTimeLineID); - /* Fill information about server */ - serverInfo.timeline = ThisTimeLineID; - serverInfo.walEnd = GetFlushRecPtr(); - serverInfo.walSegSize = wal_segment_size; - serverInfo.pgVersion = PG_VERSION_NUM; + /* Fill the greeting package */ + proposerGreeting.tag = 'g'; + proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; + proposerGreeting.pgVersion = PG_VERSION_NUM; + pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); + proposerGreeting.systemId = GetSystemIdentifier(); if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(serverInfo.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - - serverInfo.protocolVersion = SK_PROTOCOL_VERSION; - pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); - serverInfo.systemId = GetSystemIdentifier(); + proposerGreeting.timeline = ThisTimeLineID; + proposerGreeting.walSegSize = wal_segment_size; last_reconnect_attempt = GetCurrentTimestamp(); @@ -448,7 +460,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = serverInfo.timeline; + cmd.timeline = proposerGreeting.timeline; cmd.startpoint = startpos; StartReplication(&cmd); } @@ -461,15 +473,17 @@ SendMessageToNode(int i, WalMessage* msg) { WalKeeper* wk = &walkeeper[i]; - /* If there is no pending message then send new one */ - if (wk->currMsg == NULL) - { - /* Skip already acknowledged messages */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; + /* we shouldn't be already sending something */ + Assert(wk->currMsg == NULL); + /* + * Skip already acknowledged messages. Used during start to get to the + * first not yet received message. Otherwise we always just send + * 'msg'. 
+ */ + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; - wk->currMsg = msg; - } + wk->currMsg = msg; /* Only try to send the message if it's non-null */ if (wk->currMsg) @@ -530,12 +544,15 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msgQueueHead = msg; msgQueueTail = msg; - msg->size = sizeof(WalKeeperRequest) + len; + msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.vcl = propVcl; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; - msg->req.senderId = prop.nodeId; + msg->req.proposerId = proposerGreeting.proposerId; memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); @@ -574,64 +591,56 @@ CreateMessageVCLOnly(void) msgQueueHead = msg; msgQueueTail = msg; - msg->size = sizeof(WalKeeperRequest); + msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.vcl = propVcl; msg->req.beginLsn = lastSentLsn; msg->req.endLsn = lastSentLsn; - msg->req.senderId = prop.nodeId; + msg->req.proposerId = proposerGreeting.proposerId; /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; } /* - * Prepare vote request for election + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and VCL -- LSN since which we'll + * write WAL in our term. + * Sets restartLsn along the way (though it is not of much use at this point). */ static void -StartElection(void) +DetermineVCL(void) { // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - XLogRecPtr initWALPos = serverInfo.walSegSize; - prop.VCL = restartLsn = initWALPos; - prop.nodeId = serverInfo.nodeId; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_VOTING) - { - prop.nodeId.term = Max(walkeeper[i].info.server.nodeId.term, prop.nodeId.term); - restartLsn = Max(walkeeper[i].info.restartLsn, restartLsn); - if (walkeeper[i].info.epoch > prop.epoch - || (walkeeper[i].info.epoch == prop.epoch && walkeeper[i].info.flushLsn > prop.VCL)) + propVcl = wal_segment_size; + donorEpoch = 0; + restartLsn = wal_segment_size; - { - prop.epoch = walkeeper[i].info.epoch; - prop.VCL = walkeeper[i].info.flushLsn; - leader = i; - } - } - } - /* Only walkeepers from most recent epoch can report it's FlushLsn to master */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_VOTING) + if (walkeeper[i].state == SS_IDLE) { - if (walkeeper[i].info.epoch == prop.epoch) + if (walkeeper[i].voteResponse.epoch > donorEpoch || + (walkeeper[i].voteResponse.epoch == donorEpoch && + walkeeper[i].voteResponse.flushLsn > propVcl)) { - walkeeper[i].feedback.flushLsn = walkeeper[i].info.flushLsn; - } - else - { - elog(WARNING, "WalKeeper %s:%s belongs to old epoch " INT64_FORMAT " while current epoch is " INT64_FORMAT, - walkeeper[i].host, - walkeeper[i].port, - walkeeper[i].info.epoch, - prop.epoch); + donorEpoch = walkeeper[i].voteResponse.epoch; + propVcl = walkeeper[i].voteResponse.flushLsn; + donor = i; } + restartLsn = Max(walkeeper[i].voteResponse.restartLsn, restartLsn); } } - prop.nodeId.term += 1; - prop.epoch += 1; + + elog(LOG, "got votes from majority (%d) of nodes, VCL %X/%X, donor %s:%s, restart_lsn %X/%X", + quorum, + LSN_FORMAT_ARGS(propVcl), + walkeeper[donor].host, walkeeper[donor].port, + LSN_FORMAT_ARGS(restartLsn) + ); } /* @@ -675,7 
+684,7 @@ ReconnectWalKeepers(void) * Receive WAL from most advanced WAL keeper */ static bool -WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) { char conninfo[MAXCONNINFO]; char *err; @@ -683,18 +692,18 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe WalRcvStreamOptions options; sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - walkeeper[leader].host, walkeeper[leader].port, zenith_timeline_walproposer); + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { ereport(WARNING, (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[leader].host, walkeeper[leader].port, + walkeeper[donor].host, walkeeper[donor].port, err))); return false; } elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", - walkeeper[leader].host, walkeeper[leader].port, + walkeeper[donor].host, walkeeper[donor].port, (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, timeline); @@ -736,7 +745,7 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe { for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) { - if (msg->req.endLsn <= walkeeper[i].info.flushLsn) + if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) { msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ } @@ -1142,7 +1151,7 @@ AdvancePollState(int i, uint32 events) /* Note: This state corresponds to the process of sending the relevant information * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the * handshake. */ - switch (walprop_async_write(wk->conn, &serverInfo, sizeof(serverInfo))) + switch (walprop_async_write(wk->conn, &proposerGreeting, sizeof(proposerGreeting))) { case PG_ASYNC_WRITE_SUCCESS: /* If the write immediately succeeds, we can move on to the next state. */ @@ -1183,24 +1192,20 @@ AdvancePollState(int i, uint32 events) case SS_HANDSHAKE_RECV: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->info, sizeof(wk->info))) + if (!ReadPGAsyncIntoValue(i, &wk->greet, sizeof(wk->greet))) return; - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) - { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); - ResetConnection(i); - return; - } - - /* Protocol is all good, move to voting */ wk->state = SS_VOTING; wk->pollState = SPOLL_IDLE; wk->feedback.flushLsn = restartLsn; wk->feedback.hs.ts = 0; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(walkeeper[i].greet.term, propTerm); + /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. * We'll eventually get a task when the election starts. 
* @@ -1208,9 +1213,17 @@ AdvancePollState(int i, uint32 events) if (++n_connected >= quorum) { if (n_connected == quorum) - StartElection(); + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } - /* Now send max-node-id to everyone participating in voting and wait their responses */ + /* Now send voting request to the cohort and wait responses */ for (int j = 0; j < n_walkeepers; j++) { /* Remember: SS_VOTING indicates that the walkeeper is participating in @@ -1240,7 +1253,7 @@ AdvancePollState(int i, uint32 events) /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: - switch (walprop_async_write(wk->conn, &prop, sizeof(prop))) + switch (walprop_async_write(wk->conn, &voteRequest, sizeof(voteRequest))) { case PG_ASYNC_WRITE_SUCCESS: /* If the write immediately succeeds, we can move on to the next state. */ @@ -1278,16 +1291,24 @@ AdvancePollState(int i, uint32 events) case SS_WAIT_VERDICT: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->info.server.nodeId, sizeof(wk->info.server.nodeId))) + if (!ReadPGAsyncIntoValue(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; - /* If server accept our candidate, then it returns it in response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + + /* + * In case of acceptor rejecting our vote, bail out, but only if + * either it already lives in strictly higher term (concurrent + * compute spotted) or we are not elected yet and thus need the + * vote. + */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); + wk->voteResponse.term, propTerm); } + Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? 
*/ wk->state = SS_IDLE; @@ -1296,21 +1317,18 @@ AdvancePollState(int i, uint32 events) if (++n_votes == quorum) { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); + DetermineVCL(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < prop.VCL) + if (restartLsn < propVcl) { - elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); + elog(LOG, "start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(propVcl)); /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + if (!WalProposerRecovery(donor, proposerGreeting.timeline, restartLsn, propVcl)) elog(FATAL, "Failed to recover state"); } - WalProposerStartStreaming(prop.VCL); + WalProposerStartStreaming(propVcl); /* Should not return here */ } else @@ -1331,7 +1349,7 @@ AdvancePollState(int i, uint32 events) if (wk->pollState != SPOLL_RETRY) { elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", - msg->size - sizeof(WalKeeperRequest), + msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.commitLsn), LSN_FORMAT_ARGS(restartLsn), wk->host, wk->port); @@ -1464,7 +1482,7 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", wk->host, wk->port, FormatWalKeeperState(wk->state), - sizeof(wk->info.server.nodeId), buf_size); + value_size, buf_size); } /* Copy the resulting info into place */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 722fa66d5e6..29c209e63c1 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -1,3 +1,5 @@ +#include "postgres.h" + #include "replication/walproposer.h" #include "common/logging.h" #include "common/ip.h" @@ -5,16 +7,6 @@ #include #include -int CompareNodeId(NodeId* id1, NodeId* id2) -{ - return - (id1->term < id2->term) - ? -1 - : (id1->term > id2->term) - ? 1 - : memcmp(&id1->uuid, &id1->uuid, sizeof(pg_uuid_t)); -} - int CompareLsn(const void *a, const void *b) { @@ -30,7 +22,7 @@ CompareLsn(const void *a, const void *b) } /* Converts a `WKSockWaitKind` into the bit flags that would match it - * + * * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero, * which does not match any events. Attempting to wait on no events will * always timeout, so it's best to double-check the value being provided to @@ -231,4 +223,3 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } - diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index b7b35e876e5..3f03f43eb2a 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -1,6 +1,7 @@ #ifndef __WALKEEPER_H__ #define __WALKEEPER_H__ +#include "access/xlogdefs.h" #include "postgres.h" #include "port.h" #include "access/xlog_internal.h" @@ -172,7 +173,7 @@ typedef enum } WalKeeperState; /* WAL safekeeper state - individual level - * + * * This type encompasses the type of polling necessary to move on to the * next `WalKeeperState` from the current. It's things like "we need to * call PQflush some more", or "retry the current operation". 
@@ -251,77 +252,75 @@ typedef enum WANTS_SOCK_EITHER, } WKSockWaitKind; -/* - * Unique node identifier used by Paxos - */ -typedef struct NodeId -{ - uint64 term; - pg_uuid_t uuid; -} NodeId; +/* Consensus logical timestamp. */ +typedef uint64 term_t; /* - * Information about Postgres server broadcasted by WAL proposer to walkeeper + * Proposer -> Acceptor messaging. */ -typedef struct ServerInfo + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting { - uint32 protocolVersion; /* proposer-walkeeper protocol version */ - uint32 pgVersion; /* Postgres server version */ - NodeId nodeId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - XLogRecPtr walEnd; + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; TimeLineID timeline; - int walSegSize; - uint8 ztenantid[16]; -} ServerInfo; + uint32 walSegSize; +} ProposerGreeting; /* - * Vote request sent from proposer to walkeepers + * Acceptor -> Proposer initial response: the highest term acceptor voted for. */ -typedef struct RequestVote +typedef struct AcceptorGreeting { - NodeId nodeId; - XLogRecPtr VCL; /* volume commit LSN */ - uint64 epoch; /* new epoch when walkeeper reaches VCL */ -} RequestVote; + uint64 tag; + term_t term; +} AcceptorGreeting; /* - * Information of about storage node + * Proposer -> Acceptor vote request. */ -typedef struct WalKeeperInfo +typedef struct VoteRequest { - uint32 magic; /* magic for verifying content the control file */ - uint32 formatVersion; /* walkeeper format version */ - uint64 epoch; /* walkeeper's epoch */ - ServerInfo server; - XLogRecPtr commitLsn; /* part of WAL acknowledged by quorum */ - XLogRecPtr flushLsn; /* locally flushed part of WAL */ - XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper: min(commitLsn) for all walkeepers */ -} WalKeeperInfo; - -/* - * Hot standby feedback received from replica - */ -typedef struct HotStandbyFeedback -{ - TimestampTz ts; - FullTransactionId xmin; - FullTransactionId catalog_xmin; -} HotStandbyFeedback; - + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse { + uint64 tag; + term_t term; /* not really needed, just adds observability */ + uint64 voteGiven; + /// Safekeeper's log position, to let proposer choose the most advanced one + term_t epoch; + XLogRecPtr flushLsn; + XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ +} VoteResponse; /* - * Request with WAL message sent from proposer to walkeeper. + * Header of request with WAL message sent from proposer to walkeeper. */ -typedef struct WalKeeperRequest +typedef struct AppendRequestHeader { - NodeId senderId; /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ + uint64 tag; + term_t term; /* term of the proposer */ + /* + * LSN since which current proposer appends WAL; determines epoch switch + * point. 
+ */ + XLogRecPtr vcl; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ -} WalKeeperRequest; + XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; /* * All copy data message ('w') are linked in L1 send list and asynchronously sent to receivers. @@ -332,7 +331,7 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - WalKeeperRequest req; /* request to walkeeper (message header) */ + AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: * @@ -341,15 +340,31 @@ struct WalMessage * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ }; +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + /* * Report walkeeper state to proposer */ -typedef struct WalKeeperResponse +typedef struct AppendResponse { - uint64 epoch; + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. + */ + uint64 tag; + term_t term; + term_t epoch; XLogRecPtr flushLsn; HotStandbyFeedback hs; -} WalKeeperResponse; +} AppendResponse; /* @@ -369,12 +384,12 @@ typedef struct WalKeeper WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for the polling required? */ - WalKeeperInfo info; /* walkeeper info */ - WalKeeperResponse feedback; /* feedback to master */ + AcceptorGreeting greet; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse feedback; /* feedback to master */ } WalKeeper; -int CompareNodeId(NodeId* id1, NodeId* id2); int CompareLsn(const void *a, const void *b); uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); char* FormatWalKeeperState(WalKeeperState state); From f2a0837d8714496f85e7c601ef1db8fdcdcdc079 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 Aug 2021 13:33:53 +0300 Subject: [PATCH 038/214] Rename VCL to epochStartLsn and restart_lsn to truncate_lsn. epochStartLsn is the LSN since which new proposer writes its WAL in its epoch, let's be more explicit here. In several places it also actually meant something we call *commit_lsn* -- the latest lsn known to be reliably commited (it constantly moves within one wal proposer). truncate_lsn is LSN still needed by the most lagging safekeeper. restart_lsn is terminology from pg_replicaton_slots, but here we don't really have 'restart'; hopefully truncate word makes it clearer. 
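To make the renamed quantities concrete, the following self-contained sketch derives epochStartLsn and truncateLsn from a set of collected votes, modelled on the DetermineEpochStartLsn() logic this patch touches; the Vote struct, the starting values, and the absence of a voteGiven check are simplifying assumptions rather than the exact walproposer code.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;
typedef uint64_t term_t;

typedef struct
{
	term_t		epoch;			/* last epoch the safekeeper wrote WAL in */
	XLogRecPtr	flushLsn;		/* end of its locally flushed WAL */
	XLogRecPtr	truncateLsn;	/* oldest LSN it may still need for recovery */
} Vote;

/*
 * Pick the most advanced safekeeper (the donor): highest epoch wins, ties are
 * broken by the higher flush LSN.  Its flush LSN becomes epochStartLsn, the
 * point from which the new proposer writes WAL in its term.  truncateLsn is
 * advanced to the maximum any voter reported.
 */
static void
determine_epoch_start(const Vote *votes, int n,
					  XLogRecPtr *epochStartLsn, XLogRecPtr *truncateLsn,
					  int *donor)
{
	term_t		donorEpoch = 0;

	for (int i = 0; i < n; i++)
	{
		if (votes[i].epoch > donorEpoch ||
			(votes[i].epoch == donorEpoch &&
			 votes[i].flushLsn > *epochStartLsn))
		{
			donorEpoch = votes[i].epoch;
			*epochStartLsn = votes[i].flushLsn;
			*donor = i;
		}
		if (votes[i].truncateLsn > *truncateLsn)
			*truncateLsn = votes[i].truncateLsn;
	}
}

int
main(void)
{
	Vote		votes[] = {{1, 0x2000000, 0x1000000},
						   {2, 0x1800000, 0x1200000}};
	XLogRecPtr	epochStartLsn = 0;
	XLogRecPtr	truncateLsn = 0;
	int			donor = 0;

	determine_epoch_start(votes, 2, &epochStartLsn, &truncateLsn, &donor);
	printf("donor=%d epochStartLsn=%X/%X truncateLsn=%X/%X\n", donor,
		   (unsigned) (epochStartLsn >> 32), (unsigned) epochStartLsn,
		   (unsigned) (truncateLsn >> 32), (unsigned) truncateLsn);
	return 0;
}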
--- src/backend/replication/walproposer.c | 83 ++++++++++++++------------- src/include/replication/walproposer.h | 16 ++++-- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 0137cc67b8e..8e46f52b15f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -41,14 +41,14 @@ static WalKeeper walkeeper[MAX_WALKEEPERS]; static WalMessage* msgQueueHead; static WalMessage* msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ -static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers */ static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; static AppendResponse lastFeedback; -static XLogRecPtr restartLsn; /* Last position received by all walkeepers. */ +static XLogRecPtr truncateLsn; /* Last position received by all walkeepers. */ static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ -static XLogRecPtr propVcl; /* VCL of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ static int n_votes = 0; @@ -289,7 +289,7 @@ GetAcknowledgedByQuorumWALPosition(void) for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to VCL to the majority we + * Note that while we haven't pushed WAL up to epoch start lsn to the majority we * don't really know which LSN is reliably committed as reported * flush_lsn is physical end of wal, which can contain diverged * history (compared to donor). @@ -335,10 +335,10 @@ HandleWalKeeperResponse(void) { WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; - if (restartLsn < msg->req.beginLsn) + if (truncateLsn < msg->req.beginLsn) { - Assert(restartLsn < msg->req.endLsn); - restartLsn = msg->req.endLsn; + Assert(truncateLsn < msg->req.endLsn); + truncateLsn = msg->req.endLsn; } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); @@ -488,7 +488,7 @@ SendMessageToNode(int i, WalMessage* msg) /* Only try to send the message if it's non-null */ if (wk->currMsg) { - wk->currMsg->req.restartLsn = restartLsn; + wk->currMsg->req.truncateLsn = truncateLsn; wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); /* Once we've selected and set up our message, actually start sending it. */ @@ -549,7 +549,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; - msg->req.vcl = propVcl; + msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; msg->req.proposerId = proposerGreeting.proposerId; @@ -570,10 +570,10 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) /* * Create WAL message with no data, just to let the walkeepers - * know that the VCL has advanced. + * know that commit lsn has advanced. 
*/ static WalMessage* -CreateMessageVCLOnly(void) +CreateMessageCommitLsnOnly(void) { /* Create new message and append it to message queue */ WalMessage* msg; @@ -596,28 +596,29 @@ CreateMessageVCLOnly(void) msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; - msg->req.vcl = propVcl; + msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = lastSentLsn; msg->req.endLsn = lastSentLsn; msg->req.proposerId = proposerGreeting.proposerId; - /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; } /* * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and VCL -- LSN since which we'll - * write WAL in our term. - * Sets restartLsn along the way (though it is not of much use at this point). + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * Sets truncateLsn along the way (though it + * is not of much use at this point). */ static void -DetermineVCL(void) +DetermineEpochStartLsn(void) { // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - propVcl = wal_segment_size; + propEpochStartLsn = wal_segment_size; donorEpoch = 0; - restartLsn = wal_segment_size; + truncateLsn = wal_segment_size; for (int i = 0; i < n_walkeepers; i++) { @@ -625,21 +626,21 @@ DetermineVCL(void) { if (walkeeper[i].voteResponse.epoch > donorEpoch || (walkeeper[i].voteResponse.epoch == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propVcl)) + walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { donorEpoch = walkeeper[i].voteResponse.epoch; - propVcl = walkeeper[i].voteResponse.flushLsn; + propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; donor = i; } - restartLsn = Max(walkeeper[i].voteResponse.restartLsn, restartLsn); + truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); } } - elog(LOG, "got votes from majority (%d) of nodes, VCL %X/%X, donor %s:%s, restart_lsn %X/%X", + elog(LOG, "got votes from majority (%d) of nodes, epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", quorum, - LSN_FORMAT_ARGS(propVcl), + LSN_FORMAT_ARGS(propEpochStartLsn), walkeeper[donor].host, walkeeper[donor].port, - LSN_FORMAT_ARGS(restartLsn) + LSN_FORMAT_ARGS(truncateLsn) ); } @@ -1197,7 +1198,7 @@ AdvancePollState(int i, uint32 events) wk->state = SS_VOTING; wk->pollState = SPOLL_IDLE; - wk->feedback.flushLsn = restartLsn; + wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; /* @@ -1317,18 +1318,18 @@ AdvancePollState(int i, uint32 events) if (++n_votes == quorum) { - DetermineVCL(); + DetermineEpochStartLsn(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < propVcl) + if (truncateLsn < propEpochStartLsn) { - elog(LOG, "start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(propVcl)); + elog(LOG, "start recovery because restart LSN=%X/%X is not equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, restartLsn, propVcl)) + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); } - WalProposerStartStreaming(propVcl); + 
WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } else @@ -1348,10 +1349,10 @@ AdvancePollState(int i, uint32 events) /* Don't repeat logs if we have to retry the actual send operation itself */ if (wk->pollState != SPOLL_RETRY) { - elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(restartLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); } @@ -1386,7 +1387,7 @@ AdvancePollState(int i, uint32 events) { WalMessage* next; XLogRecPtr minQuorumLsn; - WalMessage* vclUpdateMsg; + WalMessage* commitLsnUpdateMsg; /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ @@ -1407,19 +1408,19 @@ AdvancePollState(int i, uint32 events) SendMessageToNode(i, next); /* - * Also send the new VCL to all the walkeepers. + * Also send the new commit lsn to all the walkeepers. * * FIXME: This is redundant for walkeepers that have other outbound messages * pending. */ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentVCLLsn) + if (minQuorumLsn > lastSentCommitLsn) { - vclUpdateMsg = CreateMessageVCLOnly(); - if (vclUpdateMsg) - BroadcastMessage(vclUpdateMsg); - lastSentVCLLsn = minQuorumLsn; + commitLsnUpdateMsg = CreateMessageCommitLsnOnly(); + if (commitLsnUpdateMsg) + BroadcastMessage(commitLsnUpdateMsg); + lastSentCommitLsn = minQuorumLsn; } break; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 3f03f43eb2a..af4d877963d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -300,7 +300,7 @@ typedef struct VoteResponse { /// Safekeeper's log position, to let proposer choose the most advanced one term_t epoch; XLogRecPtr flushLsn; - XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ } VoteResponse; /* @@ -311,15 +311,19 @@ typedef struct AppendRequestHeader uint64 tag; term_t term; /* term of the proposer */ /* - * LSN since which current proposer appends WAL; determines epoch switch - * point. + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. */ - XLogRecPtr vcl; + XLogRecPtr epochStartLsn; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ - XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ - pg_uuid_t proposerId; /* for monitoring/debugging */ + /* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last record streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ } AppendRequestHeader; /* From 82dcd2f33902ab4db43b1d8fc0dde8927947c2c1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 3 Aug 2021 19:16:54 +0300 Subject: [PATCH 039/214] [refer #27] Implement shared relsize cache to improve zenith performance. Cache relfilenode size returned by zenith_nblocks() and also update it when relation is extended. 
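As a rough illustration of the two cache update paths, the sketch below models the cache as a single entry and contrasts set_cached_relsize(), which overwrites the stored size, with update_cached_relsize(), which only ever grows it; the single-entry model and the function bodies are simplifications of the shared-memory hash table this patch adds, not its actual code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;

/*
 * One cached relation size.  The patch itself keys a shared hash table by
 * (RelFileNode, ForkNumber) and protects it with an LWLock; a single static
 * entry is enough to show the update semantics.
 */
static bool cached_valid = false;
static BlockNumber cached_size = 0;

static bool
get_cached_relsize(BlockNumber *size)
{
	if (cached_valid)
		*size = cached_size;
	return cached_valid;
}

/* Unconditional overwrite: the caller knows the new size exactly
 * (zenith_extend and zenith_truncate in this patch). */
static void
set_cached_relsize(BlockNumber size)
{
	cached_valid = true;
	cached_size = size;
}

/* Grow-only update: a smaller (possibly stale) answer never shrinks the
 * cached value (zenith_nblocks in this patch). */
static void
update_cached_relsize(BlockNumber size)
{
	if (!cached_valid || cached_size < size)
		cached_size = size;
	cached_valid = true;
}

int
main(void)
{
	BlockNumber n;

	set_cached_relsize(100);	/* relation extended to block 99: size 100 */
	update_cached_relsize(90);	/* stale page-server answer: ignored */
	update_cached_relsize(101);	/* larger answer: applied */
	if (get_cached_relsize(&n))
		printf("cached size: %u blocks\n", n);	/* prints 101 */
	return 0;
}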
Don't update it from zenith_write() or zenith_wallog_page(), since there is no guarantee that these functions wouldn't be called for some page that is not the last one It can be configured with zenith.relsize_hash_size GUC parameter. Set it to 0 to disable caching. --- contrib/zenith/Makefile | 2 +- contrib/zenith/libpagestore.c | 2 + contrib/zenith/pagestore_client.h | 7 ++ contrib/zenith/pagestore_smgr.c | 10 +- contrib/zenith/relsize_cache.c | 150 ++++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 contrib/zenith/relsize_cache.c diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile index ad41c55bd71..4b706186fff 100644 --- a/contrib/zenith/Makefile +++ b/contrib/zenith/Makefile @@ -4,7 +4,7 @@ MODULE_big = zenith OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 142999a6a8e..b726cee80f8 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -326,6 +326,8 @@ _PG_init(void) 0, NULL, NULL, NULL); + relsize_hash_init(); + if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index b4b223d3c46..dbcaa5fdb91 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -149,4 +149,11 @@ extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); + #endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 47a37b0687d..5db79710d68 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -25,6 +25,7 @@ #include "storage/bufmgr.h" #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "replication/walsender.h" #include "catalog/pg_tablespace_d.h" @@ -565,6 +566,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr lsn; zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno+1); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -871,9 +873,12 @@ BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum) { ZenithResponse *resp; - int n_blocks; + BlockNumber n_blocks; XLogRecPtr request_lsn; + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + return n_blocks; + request_lsn = zenith_get_request_lsn(false); resp = page_server->request((ZenithRequest) { .tag = T_ZenithNblocksRequest, @@ -884,6 +889,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) .lsn = request_lsn }); n_blocks = resp->n_blocks; + update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", reln->smgr_rnode.node.spcNode, @@ -905,6 +911,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber 
nblocks) { XLogRecPtr lsn; + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + /* * Truncating a relation drops all its buffers from the buffer cache without * calling smgrwrite() on them. But we must account for that in our tracking diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c new file mode 100644 index 00000000000..5cb86e116a7 --- /dev/null +++ b/contrib/zenith/relsize_cache.c @@ -0,0 +1,150 @@ +/*------------------------------------------------------------------------- + * + * relsize_cache.c + * Relation size cache for better zentih performance. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/relsize_cache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "catalog/pg_tablespace_d.h" +#include "utils/dynahash.h" +#include "utils/guc.h" + + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; +} RelTag; + +typedef struct +{ + RelTag tag; + BlockNumber size; +} RelSizeEntry; + +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + +static void +zenith_smgr_shmem_startup(void) +{ + static HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relsize_lock = (LWLockId)GetNamedLWLockTranche("zenith_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("zenith_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); +} + +bool +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size) +{ + bool found = false; + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + *size = entry->size; + found = true; + } + LWLockRelease(relsize_lock); + } + return found; +} + +void +set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* entry; + bool found; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); + if (!found || entry->size < size) + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +relsize_hash_init(void) +{ + DefineCustomIntVariable("zenith.relsize_hash_size", + "Sets the maximum number of cached relation sizes for zenith", + NULL, + &relsize_hash_size, + /* + * Size of cache entry is 20 bytes. + * So 64 entry will take about 1.2 Mb, + * which seems to be a reasonable default. 
+ */ + 64*1024, + 0, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (relsize_hash_size > 0) + { + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("zenith_relsize", 1); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = zenith_smgr_shmem_startup; + } +} \ No newline at end of file From 574ba5d24220f47a02bdce5ac09a2fe92a1d9ed4 Mon Sep 17 00:00:00 2001 From: Max Sharnoff Date: Tue, 31 Aug 2021 13:05:39 -0700 Subject: [PATCH 040/214] Cleanup walproposer changes from #60 Closes #66. Mostly corresponds to cleaning up the states we store. Goes back to single states for each WalKeeper, and we perform blocking writes for everything but sending the WAL itself. A few things have been factored out into libpqwalproposer for simplicity - like handling the nonblocking status of the connection (even though it's only changed once). --- .../libpqwalproposer/libpqwalproposer.c | 119 ++- src/backend/replication/walproposer.c | 815 +++++++----------- src/backend/replication/walproposer_utils.c | 137 ++- src/include/replication/walproposer.h | 257 ++---- 4 files changed, 575 insertions(+), 753 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 63c90f5a54b..1b8a53b5066 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -11,6 +11,7 @@ void _PG_init(void); struct WalProposerConn { PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ }; /* Prototypes for exported functions */ @@ -20,13 +21,12 @@ static WalProposerConn* libpqprop_connect_start(char* conninfo); static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); static bool libpqprop_send_query(WalProposerConn* conn, char* query); static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static int libpqprop_set_nonblocking(WalProposerConn* conn, int arg); static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static int libpqprop_consume_input(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn, bool socket_read_ready); static void libpqprop_finish(WalProposerConn* conn); static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); static WalProposerFunctionsType PQWalProposerFunctions = { libpqprop_error_message, @@ -35,13 +35,12 @@ static WalProposerFunctionsType PQWalProposerFunctions = { libpqprop_connect_poll, libpqprop_send_query, libpqprop_get_query_result, - libpqprop_set_nonblocking, libpqprop_socket, libpqprop_flush, - libpqprop_consume_input, libpqprop_finish, libpqprop_async_read, libpqprop_async_write, + libpqprop_blocking_write, }; /* Module initialization */ @@ -53,6 +52,22 @@ _PG_init(void) WalProposerFunctions = &PQWalProposerFunctions; } +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) 
== -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + /* Exported function definitions */ static char* libpqprop_error_message(WalProposerConn* conn) @@ -96,6 +111,7 @@ libpqprop_connect_start(char* conninfo) */ conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ return conn; } @@ -133,22 +149,16 @@ libpqprop_connect_poll(WalProposerConn* conn) static bool libpqprop_send_query(WalProposerConn* conn, char* query) { - int result; - bool return_val; + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; - switch ((result = PQsendQuery(conn->pg_conn, query))) - { - case 0: - return_val = false; - break; - case 1: - return_val = true; - break; - default: - elog(FATAL, "unexpected return %d from PQsendQuery", result); - } + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; - return return_val; + return true; } static WalProposerExecStatusType @@ -160,6 +170,10 @@ libpqprop_get_query_result(WalProposerConn* conn) /* Marker variable if we need to log an unexpected success result */ char* unexpected_success = NULL; + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; @@ -218,12 +232,6 @@ libpqprop_get_query_result(WalProposerConn* conn) return return_val; } -static int -libpqprop_set_nonblocking(WalProposerConn* conn, int arg) -{ - return PQsetnonblocking(conn->pg_conn, arg); -} - static pgsocket libpqprop_socket(WalProposerConn* conn) { @@ -231,15 +239,14 @@ libpqprop_socket(WalProposerConn* conn) } static int -libpqprop_flush(WalProposerConn* conn) +libpqprop_flush(WalProposerConn* conn, bool socket_read_ready) { - return (PQflush(conn->pg_conn)); -} + /* If the socket is read-ready, we have to call PQconsumeInput before + * calling PQflush (according to libpq docs) */ + if (socket_read_ready && !PQconsumeInput(conn->pg_conn)) + return -1; /* return failure if PQconsumeInput fails */ -static int -libpqprop_consume_input(WalProposerConn* conn) -{ - return (PQconsumeInput(conn->pg_conn)); + return (PQflush(conn->pg_conn)); } static void @@ -254,6 +261,10 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { int result; + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + return PG_ASYNC_READ_FAIL; + /* The docs for PQgetCopyData list the return values as: * 0 if the copy is still in progress, but no "complete row" is * available @@ -267,7 +278,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) switch (result = PQgetCopyData(conn->pg_conn, buf, true)) { case 0: - return PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN; + return PG_ASYNC_READ_TRY_AGAIN; case -1: /* As mentioned above; this shouldn't happen */ elog(FATAL, "unexpected return -1 from PQgetCopyData"); @@ -286,21 +297,26 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) { int result; + /* If we aren't in non-blocking mode, switch to it. 
*/ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + /* The docs for PQputcopyData list the return values as: * 1 if the data was queued, * 0 if it was not queued because of full buffers, or * -1 if an error occured */ - switch (result = PQputCopyData(conn->pg_conn, buf, size)) + result = PQputCopyData(conn->pg_conn, buf, size); + + /* We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more */ + Assert(result != 0); + + switch (result) { case 1: /* good -- continue */ break; - case 0: - /* FIXME: can this ever happen? the structure of walproposer - * should always empty the connection's buffers before trying - * to send more, right? */ - return PG_ASYNC_WRITE_WOULDBLOCK; case -1: return PG_ASYNC_WRITE_FAIL; default: @@ -327,3 +343,28 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) elog(FATAL, "invalid return %d from PQflush", result); } } + +static bool +libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* Ths function is very similar to libpqprop_async_write. For more + * information, refer to the comments there */ + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is non-blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 8e46f52b15f..1bbe5f30b3a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -54,11 +54,13 @@ static int donor; /* Most advanced acceptor */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -static uint32 request_poll_immediate; /* bitset of walkeepers requesting AdvancePollState */ /* Declarations of a few functions ahead of time, so that we can define them out of order. */ static void AdvancePollState(int i, uint32 events); -static bool ReadPGAsyncIntoValue(int i, void* value, size_t value_size); +static bool AsyncRead(int i, void* value, size_t value_size); +static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state); +static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); +static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); /* @@ -104,39 +106,20 @@ InitEventSet(void) } /* - * Updates the stored wait event for the walkeeper, given its current sockWaitState + * Updates the events we're already waiting on for the WAL keeper, setting it to + * the provided `events` * - * remove_if_nothing specifies whether to remove the event if the new waiting set is empty. In - * certain cases, we have remove_if_nothing = false because it's known that the walkeeper state will - * be updated immediately after if it's not waiting for any events. - * - * In general, setting remove_if_nothing = false is just an optimization; setting it to true will - * almost always be correct. Please leave a comment arguing for the validity of this optimization if - * you use it. 
+ * This function is called any time the WAL keeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. */ static void -UpdateEventSet(int i, bool remove_if_nothing) +UpdateEventSet(WalKeeper* wk, uint32 events) { - uint32 events; - WalKeeper* wk = &walkeeper[i]; - - /* - * If there isn't an applicable way to update the event, we just don't bother. This function is - * sometimes called when the walkeeper isn't waiting for anything, and so the best thing to do - * is just nothing. - */ - if (wk->sockWaitState != WANTS_NO_WAIT) - { - events = WaitKindAsEvents(wk->sockWaitState); + /* eventPos = -1 when we don't have an event */ + Assert(wk->eventPos != -1); - /* If we don't already have an event, add one! */ - if (wk->eventPos == -1) - wk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(wk->conn), NULL, wk); - else - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); - } - else if (remove_if_nothing && wk->eventPos != 1) - HackyRemoveWalProposerEvent(i); + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); } /* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. @@ -155,54 +138,43 @@ HackyRemoveWalProposerEvent(int to_remove) InitEventSet(); /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have - * a socket we can use, re-add the applicable events. - * - * We're expecting that there's no other walkeepers with `.sockWaitState = WANTS_NO_WAIT`, - * because any state without waiting should should have been handled immediately. */ + * a socket we can use, re-add the applicable events. */ for (int i = 0; i < n_walkeepers; i++) { - walkeeper[i].eventPos = -1; + uint32 desired_events = WL_NO_EVENTS; + WalKeeper* wk = &walkeeper[i]; + + wk->eventPos = -1; if (i == to_remove) continue; - if (walkeeper[i].conn) + /* If this WAL keeper isn't offline, add an event for it! */ + if ((desired_events = WalKeeperStateDesiredEvents(wk->state))) { - UpdateEventSet(i, false); - - if (walkeeper[i].sockWaitState == WANTS_NO_WAIT) - { - elog(FATAL, "Unexpected walkeeper %s:%s in %s state waiting for nothing", - walkeeper[i].host, walkeeper[i].port, FormatWalKeeperState(walkeeper[i].state)); - } - else - { - UpdateEventSet(i, false); /* Will either add an event or do nothing */ - } + wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); } } } /* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(int i, bool remove_event) +ShutdownConnection(int i) { if (walkeeper[i].conn) walprop_finish(walkeeper[i].conn); walkeeper[i].conn = NULL; walkeeper[i].state = SS_OFFLINE; - walkeeper[i].pollState = SPOLL_NONE; - walkeeper[i].sockWaitState = WANTS_NO_WAIT; walkeeper[i].currMsg = NULL; - if (remove_event) - HackyRemoveWalProposerEvent(i); + HackyRemoveWalProposerEvent(i); } /* - * This function is called to establish new connection or to reestablish connection in case - * of connection failure. - * Close current connection if any and try to initiate new one + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. 
*/ static void ResetConnection(int i) @@ -214,7 +186,7 @@ ResetConnection(int i) { elog(WARNING, "Connection with node %s:%s in %s state failed", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ShutdownConnection(i, true); + ShutdownConnection(i); } /* Try to establish new connection @@ -234,9 +206,6 @@ ResetConnection(int i) if (!wk->conn) elog(FATAL, "failed to allocate new PGconn object"); - /* The connection should always be non-blocking. It's easiest to just set that here. */ - walprop_set_nonblocking(wk->conn, true); - /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that * though, we need to check that it didn't immediately fail. */ if (walprop_status(wk->conn) == WP_CONNECTION_BAD) @@ -267,9 +236,7 @@ ResetConnection(int i) */ elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); - wk->state = SS_CONNECTING; - wk->pollState = SPOLL_CONNECT; - wk->sockWaitState = WANTS_SOCK_WRITE; + wk->state = SS_CONNECTING_WRITE; sock = walprop_socket(wk->conn); wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); @@ -467,6 +434,9 @@ WalProposerStartStreaming(XLogRecPtr startpos) /* * Send message to the particular node + * + * Always updates the state and event set for the WAL keeper; setting either of + * these before calling would be redundant work. */ static void SendMessageToNode(int i, WalMessage* msg) @@ -492,19 +462,15 @@ SendMessageToNode(int i, WalMessage* msg) wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); /* Once we've selected and set up our message, actually start sending it. */ - wk->state = SS_SEND_WAL; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; + wk->state = SS_SEND_WAL; /* Don't ned to update the event set; that's done by AdvancePollState */ AdvancePollState(i, WL_NO_EVENTS); } else { - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_SOCK_READ; - UpdateEventSet(i, true); + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); } } @@ -761,45 +727,6 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec return true; } -/* Requests the currently-running WalProposerPoll to advance the state of this walkeeper */ -static void -RequestStateAdvanceNoPoll(int i) -{ - /* We only have to change the value here; it'll be detected in a call to - * AdvancePollForAllRequested when that's made. */ - request_poll_immediate |= (1 << i); -} - -static void -AdvancePollForAllRequested(void) -{ - uint32 poll_set = request_poll_immediate; - - /* - * We have this in a loop because -- in theory -- polling the requested states could produce - * more that are ready to be polled, though this *really* shouldn't occur in practice. - */ - while ((poll_set = request_poll_immediate)) - { - /* "Take responsibility" for the poll set. We don't want any possibility of other calls to - * AdvancePollForAllRequested duplicating an AdvancePollState. */ - request_poll_immediate = 0; - - /* - * Loop through all nonzero bits and call AdvancePollState - * - * FIXME: This can probably be much more efficient, using something like __builtin__clz. - * Maybe it doesn't matter though. 
- */ - for (int i = 0; i < n_walkeepers; i++) - { - /* If the ith bit is set, that state requested advancement */ - if (poll_set & (1 << i)) - AdvancePollState(i, WL_NO_EVENTS); - } - } -} - /* * Advance the WAL proposer state machine, waiting each time for events to occur */ @@ -819,22 +746,12 @@ WalProposerPoll(void) wk = (WalKeeper*) event.user_data; i = (int)(wk - walkeeper); - if (rc != 0) - { - /* - * If the event contains something that one of our walkeeper states - * was waiting for, we'll advance its state. - */ - if (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) - AdvancePollState(i, event.events); - - /* - * It's possible for AdvancePollState to result in extra states - * being ready to immediately advance to the next state (with - * pollState = SPOLL_NONE). We deal with that here. - */ - AdvancePollForAllRequested(); - } + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE))) + AdvancePollState(i, event.events); /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ ReconnectWalKeepers(); @@ -852,69 +769,69 @@ WalProposerPoll(void) } } -/* Performs the logic for advancing the state machine of the 'i'th walkeeper, given that a certain - * set of events has occured. */ +/* Performs the logic for advancing the state machine of the 'i'th walkeeper, + * given that a certain set of events has occured. */ static void AdvancePollState(int i, uint32 events) { WalKeeper* wk = &walkeeper[i]; - /* Continue polling all the while we don't need to wait. - * - * At the bottom of this function is "while (walkeeper[i].sockWaitState == WANTS_NO_WAIT)" */ - do { - uint32 expected_events = WaitKindAsEvents(wk->sockWaitState); + /* Keep advancing the state while either: + * (a) the event is still unprocessed (usually because it's the first + * iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events + */ + while (events || StateShouldImmediatelyExecute(wk->state)) + { + /* Sanity check. We assume further down that the operations don't block + * because the socket is ready. */ + AssertEventsOkForState(events, wk); - /* If we were expecting SOME event but nothing happened, panic. */ - if ((expected_events & events) == 0 && expected_events) + /* Execute the code corresponding to the current state */ + switch (wk->state) { - elog(FATAL, - "unexpected event for WalKeeper poll. Expected %s, found code %s (see: FormatEvents).", - FormatWKSockWaitKind(wk->sockWaitState), FormatEvents(events)); - } + /* WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - /* Now that we've checked the event is ok, we'll actually run the thing we're looking for */ - switch (wk->pollState) - { - /* If the polling corresponds to a "full" operation, we'll skip straight to that - we - * don't actually need to poll here. */ - case SPOLL_NONE: - case SPOLL_RETRY: - /* Equivalent to 'break', but more descriptive. */ - goto ExecuteNextProtocolState; - - /* On idle polling states, we wait for the socket to open for reading. If this happens, - * the connection has closed *normally*, so we're just done. 
*/ - case SPOLL_IDLE: - elog(LOG, "Walkeeper %s:%s closed connection from %s state", - wk->host, wk->port, FormatWalKeeperState(wk->state)); - /* 'true' to remove existing event for this walkeeper */ - ShutdownConnection(i, true); - return; - - /* Call PQconnectPoll to finalize the connection */ - case SPOLL_CONNECT: + /* Both connecting states run the same logic. The only difference is + * the events they're expecting */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: { WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - pgsocket new_sock = walprop_socket(wk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; switch (result) { case WP_CONN_POLLING_OK: elog(LOG, "Connected with node %s:%s", wk->host, wk->port); - /* If we're fully connected, we're good! We can move on to the next state */ + /* Once we're fully connected, we can move to the next state */ wk->state = SS_EXEC_STARTWALPUSH; - /* Update the socket -- it might have changed */ - HackyRemoveWalProposerEvent(i); - - /* We need to just pick an event to wait on; this will be overriden - * anyways later. */ - wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, new_sock, NULL, wk); + /* Even though SS_EXEC_STARTWALPUSH doesn't wait on anything, + * we do need to replace the current event, so we have to + * just pick something. We'll eventually need the socket to + * be readable, so we go with that. */ + new_events = WL_SOCKET_READABLE; + break; - /* We're done, but some of the other result cases have cleanup left to do */ - goto ExecuteNextProtocolState; + /* If we need to poll to finish connecting, continue doing that */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; case WP_CONN_POLLING_FAILED: elog(WARNING, "Failed to connect to node '%s:%s': %s", @@ -922,154 +839,21 @@ AdvancePollState(int i, uint32 events) /* If connecting failed, we don't want to restart the connection because * that might run us into a loop. Instead, shut it down -- it'll naturally * restart at a slower interval on calls to ReconnectWalKeepers. */ - ShutdownConnection(i, true); + ShutdownConnection(i); return; - - case WP_CONN_POLLING_READING: - wk->sockWaitState = WANTS_SOCK_READ; - break; - - case WP_CONN_POLLING_WRITING: - wk->sockWaitState = WANTS_SOCK_WRITE; - break; } - /* If we got here, we either have to wait for reading or - * writing. The value of walkeeper[i].sockWaitState indicates - * which one of these it is. - * - * We also have to update the socket here, even if the file - * descriptor itself hasn't changed. It's possible for libpq to - * close the socket and then open a new one, reusing the same - * file descriptor. If this happens, epoll will have - * automatically removed the socket, so we'll stop receiving - * events for it unless we re-add the socket. - * - * To update the socket, we the event and add a new one back. - */ + /* Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on the new + * socket. */ HackyRemoveWalProposerEvent(i); - - wk->eventPos = AddWaitEventToSet(waitEvents, WaitKindAsEvents(wk->sockWaitState), new_sock, NULL, wk); - - /* We still have polling to do, so we can't move on to the next state. 
*/ - return; - } - - case SPOLL_WRITE_PQ_FLUSH: - { - int flush_result; - - /* If the socket is ready for reading, we have to call PQconsumeInput before - * attempting to flush. */ - if (events & WL_SOCKET_READABLE) - { - /* PQconsumeInput returns 1 if ok, 0 if there was an error */ - if (!walprop_consume_input(wk->conn)) - { - elog(WARNING, "Failed to pre-flush read input for node %s:%s in state [%s]: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } - } - - /* PQflush returns: - * 0 if uccessful, - * 1 if unable to send everything yet, - * -1 if it failed */ - switch (flush_result = walprop_flush(wk->conn)) - { - case 0: - /* On success, go to the next state. Our current state only indicates the - * state that *started* the writing, so we need to use that to figure out - * what to do next. */ - switch (wk->state) - { - case SS_EXEC_STARTWALPUSH: - wk->state = SS_WAIT_EXEC_RESULT; - break; - case SS_HANDSHAKE_SEND: - wk->state = SS_HANDSHAKE_RECV; - break; - case SS_SEND_VOTE: - wk->state = SS_WAIT_VERDICT; - break; - case SS_SEND_WAL: - wk->state = SS_RECV_FEEDBACK; - break; - default: - elog(FATAL, "Unexpected writing state [%s] for node %s:%s", - FormatWalKeeperState(wk->state), wk->host, wk->port); - } - - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case 1: - /* Nothing more to do - we'll just have to wait until we can flush again */ - return; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - break; - default: - elog(FATAL, "invalid return %d from PQflush", flush_result); - } + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); break; } - case SPOLL_PQ_CONSUME_AND_RETRY: - /* PQconsumeInput returns 1 on success (though maybe nothing was read), and 0 on - * failure. */ - if (walprop_consume_input(wk->conn)) - /* On success, retry the operation */ - goto ExecuteNextProtocolState; - else - { - /* On failure, print the failure and move on */ - elog(WARNING, "Failed to read input for node %s:%s in state %s: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } - } - -ExecuteNextProtocolState: - /* If we get here, walkeeper[i].pollState now corresponds to either SPOLL_NONE or - * SPOLL_RETRY. In either case, we should execute the operation described by the high-level - * state. - * - * All of the cases in this switch statement are provided in the order that state - * transitions happen, moving downwards. So `SS_CONNECTING` moves into - * `SS_EXEC_STARTWALPUSH`, `SS_EXEC_STARTWALPUSH` moves into `SS_WAIT_EXEC_RESULT`, etc. - * - * If/when new states are added, they should abide by the same formatting. - * - * More information about the high-level flow between states is available in the comments - * for WalKeeperState. */ - switch (wk->state) - { - /* walkeepers aren't taken out of SS_OFFLINE by polling. */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - - /* Connecting is handled by the SPOLL_CONNECT, which then puts us into - * SS_EXEC_STARTWALPUSH. There's no singular state advancement to be made here. 
*/ - case SS_CONNECTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is connecting", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - - /* Send "START_WAL_PUSH" command to the walkeeper. After sending, wait for response with - * SS_WAIT_EXEC_RESULT */ + /* Send "START_WAL_PUSH" command to the walkeeper. After sending, + * wait for response with SS_WAIT_EXEC_RESULT */ case SS_EXEC_STARTWALPUSH: - { - int flush_result; - if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", @@ -1078,53 +862,25 @@ AdvancePollState(int i, uint32 events) return; } - /* The query has been started (put into buffers), but hasn't been flushed yet. We - * should do that now. If there's more flushing required, keep doing that until it's - * done */ - switch ((flush_result = walprop_flush(wk->conn))) - { - case 0: - /* success -- go to the next state */ - wk->state = SS_WAIT_EXEC_RESULT; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case 1: - /* we'll have to flush again */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s to exec command: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); - return; - default: - elog(FATAL, "invalid return %d from PQflush", flush_result); - } - - /* If no waiting is required, we'll get to that shortly */ - UpdateEventSet(i, false); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); break; - } - /* Waiting for the result of the "START_WAL_PUSH" command. If successful, proceed to - * SS_HANDSHAKE_SEND. If needs more, wait until we can read and retry. */ case SS_WAIT_EXEC_RESULT: - /* Call our wrapper around PQisBusy + PQgetResult to inspect the result */ switch (walprop_get_query_result(wk->conn)) { /* Successful result, move on to starting the handshake */ case WP_EXEC_SUCCESS_COPYBOTH: - wk->state = SS_HANDSHAKE_SEND; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; + /* Because this state is immediately executable, we'll + * start this on the next iteration of the loop */ + wk->state = SS_HANDSHAKE_SEND; break; - /* We need more calls to PQconsumeInput to completely receive this result */ + /* Needs repeated calls to finish. Wait until the socket is + * readable */ case WP_EXEC_NEEDS_INPUT: - wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; - wk->sockWaitState = WANTS_SOCK_READ; + /* SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set */ break; case WP_EXEC_FAILED: @@ -1139,65 +895,34 @@ AdvancePollState(int i, uint32 events) elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); ResetConnection(i); - break; + return; } - - /* If the wait state is empty, don't remove the event -- we have more work to do */ - UpdateEventSet(i, false); - break; - /* Start handshake: first of all send information about server */ + /* Start handshake: first of all send information about the WAL + * keeper. After sending, we wait on SS_HANDSHAKE_RECV for a + * response to finish the handshake. */ case SS_HANDSHAKE_SEND: - /* Note: This state corresponds to the process of sending the relevant information - * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the - * handshake. 
*/ - switch (walprop_async_write(wk->conn, &proposerGreeting, sizeof(proposerGreeting))) - { - case PG_ASYNC_WRITE_SUCCESS: - /* If the write immediately succeeds, we can move on to the next state. */ - wk->state = SS_HANDSHAKE_RECV; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - - case PG_ASYNC_WRITE_WOULDBLOCK: - /* Wait until the socket is write-ready and try again */ - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - - case PG_ASYNC_WRITE_TRY_FLUSH: - /* We need to call PQflush some number of additional times, with different - * actions depending on whether the socket is readable or writable */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - - case PG_ASYNC_WRITE_FAIL: - /* On failure, print the error and reset the connection */ - elog(WARNING, "Handshake with node %s:%s failed to start: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } + /* On failure, logging & resetting the connection is handled. We + * just need to handle the control flow. */ + if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) + return; - /* Update the event set for this walkeeper, depending on what it's been changed to - * - * We set remove_if_nothing = false because we'll immediately execute - * SS_HANDSHAKE_RECV on the next iteration of the outer loop. */ - UpdateEventSet(i, false); break; - /* Finish handshake comms: receive information about the walkeeper */ + /* Finish handshake comms: receive information about the WAL keeper */ case SS_HANDSHAKE_RECV: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->greet, sizeof(wk->greet))) + if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) return; - wk->state = SS_VOTING; - wk->pollState = SPOLL_IDLE; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; + /* Don't need to update the event set yet. Either we update the + * event set to WL_SOCKET_READABLE *or* we change the state to + * SS_SEND_VOTE in the loop below */ + UpdateEventSet(wk, WL_SOCKET_READABLE); wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; @@ -1211,7 +936,13 @@ AdvancePollState(int i, uint32 events) * We'll eventually get a task when the election starts. * * If we do have quorum, we can start an election */ - if (++n_connected >= quorum) + if (++n_connected < quorum) + { + /* SS_VOTING is an idle state; read-ready indicates the + * connection closed. */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else { if (n_connected == quorum) { @@ -1233,13 +964,8 @@ AdvancePollState(int i, uint32 events) if (walkeeper[j].state == SS_VOTING) { walkeeper[j].state = SS_SEND_VOTE; - walkeeper[j].pollState = SPOLL_NONE; - walkeeper[j].sockWaitState = WANTS_NO_WAIT; - - /* If this isn't the current walkeeper, defer handling this state until - * later. We'll mark it for individual work in WalProposerPoll. */ - if (j != i) - RequestStateAdvanceNoPoll(j); + /* Immediately send info */ + AdvancePollState(j, WL_NO_EVENTS); } } } @@ -1249,53 +975,26 @@ AdvancePollState(int i, uint32 events) * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to * SS_SEND_VOTE. 
*/ case SS_VOTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", wk->host, wk->port); + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", + wk->host, wk->port); break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: - switch (walprop_async_write(wk->conn, &voteRequest, sizeof(voteRequest))) - { - case PG_ASYNC_WRITE_SUCCESS: - /* If the write immediately succeeds, we can move on to the next state. */ - wk->state = SS_WAIT_VERDICT; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case PG_ASYNC_WRITE_WOULDBLOCK: - /* Wait until the socket is write-ready and try again */ - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - case PG_ASYNC_WRITE_TRY_FLUSH: - /* We need to call PQflush some number of additional times, with different - * actions depending on whether the socket is readable or writable */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case PG_ASYNC_WRITE_FAIL: - /* Report the failure and reset the connection; there isn't much - * more we can do. */ - elog(WARNING, "Failed to send vote request to node %s:%s: %s", - wk->host, wk->port, - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; - /* Don't remove from the event set if there's nothing we're waiting for; we'll get - * it on the next iteration of the loop */ - UpdateEventSet(i, false); + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ break; /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->voteResponse, sizeof(wk->voteResponse))) + if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; - /* * In case of acceptor rejecting our vote, bail out, but only if * either it already lives in strictly higher term (concurrent @@ -1312,12 +1011,17 @@ AdvancePollState(int i, uint32 events) Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? */ - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_NO_WAIT; - if (++n_votes == quorum) + if (++n_votes != quorum) { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for read-ready */ + DetermineEpochStartLsn(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ @@ -1332,56 +1036,46 @@ AdvancePollState(int i, uint32 events) WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } - else - { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); - } break; - /* Start to send the message at wk->currMsg. Triggered only by calls to + /* Idle state for sending WAL. 
Moved out only by calls to * SendMessageToNode */ + case SS_IDLE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Start to send the message at wk->currMsg. Triggered only by calls + * to SendMessageToNode */ case SS_SEND_WAL: { WalMessage* msg = wk->currMsg; - /* Don't repeat logs if we have to retry the actual send operation itself */ - if (wk->pollState != SPOLL_RETRY) - { - elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(truncateLsn), - wk->host, wk->port); - } + elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(truncateLsn), + wk->host, wk->port); - switch (walprop_async_write(wk->conn, &msg->req, msg->size)) - { - case PG_ASYNC_WRITE_SUCCESS: - wk->state = SS_RECV_FEEDBACK; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case PG_ASYNC_WRITE_WOULDBLOCK: - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - case PG_ASYNC_WRITE_TRY_FLUSH: - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send WAL to node %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - } + /* We write with msg->size here because the body of the message + * is stored after the end of the WalMessage struct, in the + * allocation for each msg */ + if (!AsyncWrite(i, &msg->req, msg->size, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + return; - /* Don't remove if if sockWaitState == WANTS_NO_WAIT, because we'll immediately move - * on to SS_RECV_FEEDBACK if that's the case. */ - UpdateEventSet(i, false); break; } + /* Flush the WAL message we're sending from SS_SEND_WAL */ + case SS_SEND_WAL_FLUSH: + /* AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll wait + * until the next poll comes along. */ + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) + return; + + break; + /* Start to receive the feedback from a message sent via SS_SEND_WAL */ case SS_RECV_FEEDBACK: { @@ -1391,21 +1085,16 @@ AdvancePollState(int i, uint32 events) /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->feedback, sizeof(wk->feedback))) + if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) return; next = wk->currMsg->next; Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_NO_WAIT; - /* Don't update the event set; that's handled by SendMessageToNode if necessary */ - wk->currMsg = NULL; HandleWalKeeperResponse(); - SendMessageToNode(i, next); + SendMessageToNode(i, next); /* Updates state & event set */ /* * Also send the new commit lsn to all the walkeepers. @@ -1424,30 +1113,29 @@ AdvancePollState(int i, uint32 events) } break; } - - /* Truly an idle state - there isn't any typ of advancement expected here. 
*/ - case SS_IDLE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); - break; /* actually unreachable; makes the compiler happier */ } - /* On subsequent iterations of the loop, there's no additonal events to process */ + /* We've already done something for these events - don't attempt more + * states than we need to. */ events = WL_NO_EVENTS; - } while (walkeeper[i].sockWaitState == WANTS_NO_WAIT && walkeeper[i].pollState != SPOLL_IDLE); + } } /* - * Reads a CopyData block into a value, returning whether the read was successful + * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, + * returning whether the read was successful. * - * If the read was not immediately successful (either polling is required, or it actually failed), - * then the state is set appropriately on the walkeeper. + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. */ -bool -ReadPGAsyncIntoValue(int i, void* value, size_t value_size) +static bool +AsyncRead(int i, void* value, size_t value_size) { WalKeeper* wk = &walkeeper[i]; char *buf = NULL; int buf_size = -1; + uint32 events; switch (walprop_async_read(wk->conn, &buf, &buf_size)) { @@ -1455,14 +1143,10 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) case PG_ASYNC_READ_SUCCESS: break; - case PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN: - wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; - - if (wk->sockWaitState != WANTS_SOCK_READ) - { - wk->sockWaitState = WANTS_SOCK_READ; - UpdateEventSet(i, true); - } + /* If we need more input, wait until the socket is read-ready and try + * again. */ + case PG_ASYNC_READ_TRY_AGAIN: + UpdateEventSet(wk, WL_SOCKET_READABLE); return false; case PG_ASYNC_READ_FAIL: @@ -1477,7 +1161,7 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) /* * If we get here, the read was ok, but we still need to check it was the right amount */ - if (buf_size != value_size) + if ((size_t) buf_size != value_size) { elog(FATAL, "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", @@ -1488,6 +1172,131 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) /* Copy the resulting info into place */ memcpy(value, buf, buf_size); + + /* Update the events for the WalKeeper, if it's going to wait */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. + */ +static bool +BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + if (!walprop_blocking_write(wk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + wk->state = success_state; + + /* If the new state will be waiting for events to happen, update the event + * set to wait for those */ + events = WalKeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Starts a write into the 'i'th WAL keeper's postgres connection, moving to + * success_state only when the write succeeds. 
If the write needs flushing, + * moves to flush_state. + * + * Returns false only if the write immediately fails. Upon failure, a warning is + * emitted and the connection is reset. + */ +static bool +AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + switch (walprop_async_write(wk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + wk->state = success_state; + break; + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We still need to call PQflush some more to finish the job; go to + * the appropriate state. Update the event set at the bottom of this + * function */ + wk->state = flush_state; + break; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + /* If the new state will be waiting for something, update the event set */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing completes, moves to 'success_state' and returns true. If more + * flushes are needed, does nothing and returns true. + * + * On failure, emits a warning, resets the connection, and returns false. + */ +static bool +AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + /* PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(wk->conn, socket_read_ready)) + { + case 0: + /* On success, move to the next state - that logic is further down */ + break; + case 1: + /* Nothing to do; try again when the socket's ready */ + return true; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + wk->state = success_state; + + /* If the new state will be waiting for something, update the event set */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + return true; } diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 29c209e63c1..16d84ac7f17 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -21,36 +21,6 @@ CompareLsn(const void *a, const void *b) return 1; } -/* Converts a `WKSockWaitKind` into the bit flags that would match it - * - * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero, - * which does not match any events. Attempting to wait on no events will - * always timeout, so it's best to double-check the value being provided to - * this function where necessary. 
*/ -uint32 -WaitKindAsEvents(WKSockWaitKind wait_kind) -{ - uint32 return_val; - - switch (wait_kind) - { - case WANTS_NO_WAIT: - return_val = WL_NO_EVENTS; - break; - case WANTS_SOCK_READ: - return_val = WL_SOCKET_READABLE; - break; - case WANTS_SOCK_WRITE: - return_val = WL_SOCKET_WRITEABLE; - break; - case WANTS_SOCK_EITHER: - return_val = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - } - - return return_val; -} - /* Returns a human-readable string corresonding to the WalKeeperState * * The string should not be freed. @@ -66,14 +36,15 @@ WaitKindAsEvents(WKSockWaitKind wait_kind) char* FormatWalKeeperState(WalKeeperState state) { - char* return_val; + char* return_val = NULL; switch (state) { case SS_OFFLINE: return_val = "offline"; break; - case SS_CONNECTING: + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: return_val = "connecting"; break; case SS_EXEC_STARTWALPUSH: @@ -103,39 +74,111 @@ FormatWalKeeperState(WalKeeperState state) case SS_SEND_WAL: return_val = "WAL-sending"; break; + case SS_SEND_WAL_FLUSH: + return_val = "WAL-sending (flushing)"; + break; case SS_RECV_FEEDBACK: return_val = "WAL-feedback-receiving"; break; } + Assert(return_val != NULL); + return return_val; } -/* Returns a human-readable string corresponding to the WKSockWaitKind +/* Asserts that the provided events are expected for given WAL keeper's state */ +void +AssertEventsOkForState(uint32 events, WalKeeper* wk) +{ + uint32 expected = WalKeeperStateDesiredEvents(wk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. + * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. */ + elog(WARNING, "events %s mismatched for walkeeper %s:%s in state [%s]", + FormatEvents(events), wk->host, wk->port, FormatWalKeeperState(wk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a WAL keeper in this state should be waiting on * - * The string should not be freed. */ -char* -FormatWKSockWaitKind(WKSockWaitKind wait_kind) + * This will return WL_NO_EVENTS (= 0) for some events. 
*/ +uint32 +WalKeeperStateDesiredEvents(WalKeeperState state) { - char* return_val; + uint32 result; - switch (wait_kind) + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) { - case WANTS_NO_WAIT: - return_val = ""; + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; break; - case WANTS_SOCK_READ: - return_val = ""; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; break; - case WANTS_SOCK_WRITE: - return_val = ""; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + case SS_RECV_FEEDBACK: + result = WL_SOCKET_READABLE; + break; + + /* Most writing states don't require any socket conditions */ + case SS_EXEC_STARTWALPUSH: + case SS_HANDSHAKE_SEND: + case SS_SEND_VOTE: + case SS_SEND_WAL: + result = WL_NO_EVENTS; break; - case WANTS_SOCK_EITHER: - return_val = ""; + /* but flushing does require read- or write-ready */ + case SS_SEND_WAL_FLUSH: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; break; } - return return_val; + return result; +} + +/* Returns whether the WAL keeper state corresponds to something that should be + * immediately executed -- i.e. it is not idle, and is not currently waiting. */ +bool +StateShouldImmediatelyExecute(WalKeeperState state) +{ + /* This is actually pretty simple to determine. */ + return WalKeeperStateDesiredEvents(state) == WL_NO_EVENTS + && state != SS_OFFLINE; } /* Returns a human-readable string corresponding to the event set diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index af4d877963d..6741e9f82dd 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -22,11 +22,7 @@ * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 */ -#ifndef WL_NO_EVENTS #define WL_NO_EVENTS 0 -#else -#error "WL_NO_EVENTS already defined" -#endif extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; @@ -46,9 +42,9 @@ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then - * call PQconsumeInput and try again. */ - PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, } PGAsyncReadResult; @@ -58,9 +54,6 @@ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, - /* There wasn't space in the buffers to queue the data; wait until the - * socket is write-ready and try again. */ - PG_ASYNC_WRITE_WOULDBLOCK, /* The write started, but you'll need to call PQflush some more times * to finish it off. We just tried, so it's best to wait until the * connection is read- or write-ready to try again. 
@@ -73,98 +66,109 @@ typedef enum PG_ASYNC_WRITE_FAIL, } PGAsyncWriteResult; -/* WAL safekeeper state - high level */ +/* + * WAL safekeeper state + * + * States are listed here in the order that they're executed - with the only + * exception occuring from the "send WAL" cycle, which loops as: + * + * SS_IDLE -> SS_SEND_WAL (+ flush) -> SS_RECV_FEEDBACK -> SS_IDLE/SS_SEND_WAL + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + * + * Also note: In places we say that a state "immediately" moves to another. This + * happens in states that only exist to execute program logic, so they run + * exactly once (when moved into), without waiting for any socket conditions. + * + * For example, when we set a WalKeeper's state to SS_SEND_VOTE, we immediately + * call AdvancePollState - during which the WalKeeper switches its state to + * SS_WAIT_VERDICT. + */ typedef enum { /* * Does not have an active connection and will stay that way until - * further notice. May be paired with: - * - SPOLL_NONE + * further notice. * - * Moves to SS_CONNECTING only by calls to ResetConnection. + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. */ SS_OFFLINE, + /* - * Currently in the process of connecting. May be paired with: - * - SPOLL_CONNECT + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. * * After the connection is made, moves to SS_EXEC_STARTWALPUSH. */ - SS_CONNECTING, + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + /* - * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. May be paired with: - * - SPOLL_NONE - * - SPOLL_WRITE_PQ_FLUSH - * - * After the query sends, moves to SS_WAIT_EXEC_RESULT. + * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. + * Performs a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. */ SS_EXEC_STARTWALPUSH, /* - * Waiting for the result of the "START_WAL_PUSH" command. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY - * - * We only pair with PQconsumeInput because we *need* to wait until the socket is open for - * reading to try again. + * Waiting for the result of the "START_WAL_PUSH" command. * * After we get a successful result, moves to SS_HANDSHAKE_SEND. */ SS_WAIT_EXEC_RESULT, + /* - * Executing the sending half of the handshake. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH if it hasn't finished sending, - * - SPOLL_RETRY if buffers are full and we just need to try again, - * - SPOLL_NONE - * - * After sending, moves to SS_HANDSHAKE_RECV. + * Executing the sending half of the handshake. Performs the blocking send, + * then immediately moves to SS_HANDSHAKE_RECV. */ SS_HANDSHAKE_SEND, /* - * Executing the receiving half of the handshake. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY if we need more input - * - SPOLL_NONE - * - * After receiving, moves to SS_VOTING. + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. */ SS_HANDSHAKE_RECV, + /* - * Currently participating in voting, but a quorum hasn't yet been reached. Idle state. May be - * paired with: - * - SPOLL_IDLE + * Currently participating in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. 
* - * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of SS_HANDSHAKE_RECV. + * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of + * SS_HANDSHAKE_RECV. */ SS_VOTING, /* - * Currently sending the assigned vote + * Performs a blocking send of the assigned vote, then immediately moves to + * SS_WAIT_VERDICT. */ SS_SEND_VOTE, /* - * Sent voting information, waiting to receive confirmation from the node. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH - * - * After receiving, moves to SS_IDLE. + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE. */ SS_WAIT_VERDICT, + /* - * Waiting for quorum to send WAL. Idle state. May be paired with: - * - SPOLL_IDLE + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. * * Moves to SS_SEND_WAL only by calls to SendMessageToNode. */ SS_IDLE, /* - * Currently sending the message at currMsg. This state is only ever reached through calls to - * SendMessageToNode. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH - * - SPOLL_NONE + * Start sending the message at currMsg. This state is only ever reached + * through calls to SendMessageToNode. * - * After sending, moves to SS_RECV_FEEDBACK. + * Sending needs to flush; immediately moves to SS_SEND_WAL_FLUSH. */ SS_SEND_WAL, /* - * Currently reading feedback from sending the WAL. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY - * - SPOLL_NONE + * Flush the WAL message, repeated until successful. On success, moves to + * SS_RECV_FEEDBACK. + */ + SS_SEND_WAL_FLUSH, + /* + * Currently reading feedback from sending the WAL. * * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to * SendMessageToNode. @@ -172,86 +176,6 @@ typedef enum SS_RECV_FEEDBACK, } WalKeeperState; -/* WAL safekeeper state - individual level - * - * This type encompasses the type of polling necessary to move on to the - * next `WalKeeperState` from the current. It's things like "we need to - * call PQflush some more", or "retry the current operation". - */ -typedef enum -{ - /* - * The current state is the one we want to be in; we just haven't run - * the code for it. It should be processed with AdvancePollState to - * start to advance to the next state. - * - * Expected WKSockWaitKind: WANTS_NO_WAIT. - * - * Note! This polling state is different from the others: its attached - * WalKeeperState is what *will* be executed, not what just was. - */ - SPOLL_NONE, - /* - * We need to retry the operation once the socket permits it - * - * Expected WKSockWaitKind: Any of WANTS_SOCK_READ, WANTS_SOCK_WRITE, - * WANTS_SOCK_EITHER -- operation dependent. - */ - SPOLL_RETRY, - /* - * Marker for states that do not expect to be advanced by calls to AdvancePollState. Not to be - * confused with SS_IDLE, which carries a different (but related) meaning. - * - * For this polling state, we interpret any read-readiness on the socket as an indication that - * the connection has closed normally. - * - * Expected WKSockWaitKind: WANTS_SOCK_READ - */ - SPOLL_IDLE, - /* - * We need to repeat calls to PQconnectPoll. This is only available for - * SS_CONNECTING - * - * Expected WKSockWaitKind: WANTS_SOCK_READ or WANTS_SOCK_WRITE - */ - SPOLL_CONNECT, - /* Poll with PQflush, finishing up a call to WritePGAsync. Always - * combined with writing states, like SS_HANDSHAKE_SEND or SS_SEND_WAL. 
- * - * Expected WKSockWaitKind: WANTS_SOCK_EITHER - */ - SPOLL_WRITE_PQ_FLUSH, - /* - * Get input with PQconsumeInput and try the operation again. This is - * always combined with reading states -- like SS_HANDSHAKE_RECV or - * SS_WAIT_VERDICT, and the operation repetition helps to reduce the - * amount of repeated logic. - * - * Expected WKSockWaitKind: WANTS_SOCK_READ - */ - SPOLL_PQ_CONSUME_AND_RETRY, -} WalKeeperPollState; - -/* The state of the socket that we're waiting on. This is used to - * double-check for polling that the socket we're being handed is correct. - * - * Used in the sockWaitState field of WalKeeper, in combination with the - * WalKeeperPollState. - * - * Each polling state above lists the set of values that they accept. */ -typedef enum -{ - /* No waiting is required for the poll state */ - WANTS_NO_WAIT, - /* Polling should resume only once the socket is ready for reading */ - WANTS_SOCK_READ, - /* Polling should resume only once the socket is ready for writing */ - WANTS_SOCK_WRITE, - /* Polling should resume once the socket is ready for reading or - * writing */ - WANTS_SOCK_EITHER, -} WKSockWaitKind; - /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -379,15 +303,19 @@ typedef struct WalKeeper char const* host; char const* port; char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ - WalProposerConn* conn; /* postgres protocol connection to the walreceiver */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_SEND_WAL; not before. + */ + WalProposerConn* conn; WalMessage* currMsg; /* message been send to the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ WalKeeperState state; /* walkeeper state machine state */ - WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ - WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for - the polling required? */ AcceptorGreeting greet; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse feedback; /* feedback to master */ @@ -395,9 +323,10 @@ typedef struct WalKeeper int CompareLsn(const void *a, const void *b); -uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); char* FormatWalKeeperState(WalKeeperState state); -char* FormatWKSockWaitKind(WKSockWaitKind wait_kind); +void AssertEventsOkForState(uint32 events, WalKeeper* wk); +uint32 WalKeeperStateDesiredEvents(WalKeeperState state); +bool StateShouldImmediatelyExecute(WalKeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); @@ -442,8 +371,8 @@ typedef enum * * Do not expect PQerrorMessage to be appropriately set. */ WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, call PQconsumeInput, then try again. - * Internally, this is returned when PQisBusy indicates that PQgetResult would block. */ + /* No result available at this time. Wait until read-ready, then call again. Internally, this is + * returned when PQisBusy indicates that PQgetResult would block. */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. 
*/ WP_EXEC_FAILED, @@ -476,23 +405,17 @@ typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); /* Re-exported PQconectPoll */ typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); -/* Re-exported PQsendQuery */ +/* Blocking wrapper around PQsendQuery */ typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); -/* Wrapper around PQisBusy + PQgetResult */ +/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); -/* Re-exported PQsetnonblocking */ -typedef int (*walprop_set_nonblocking_fn) (WalProposerConn* conn, int arg); - /* Re-exported PQsocket */ typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); -/* Re-exported PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); - -/* Re-exported PQconsumeInput */ -typedef int (*walprop_consume_input_fn) (WalProposerConn* conn); +/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn, bool socket_read_ready); /* Re-exported PQfinish */ typedef void (*walprop_finish_fn) (WalProposerConn* conn); @@ -507,9 +430,9 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); * protocol with the walkeepers, so it should not be used as-is for any * other purpose. * - * Note: If possible, using is generally preferred, - * because it performs a bit of extra checking work that's always required - * and is normally somewhat verbose. + * Note: If possible, using is generally preferred, because it + * performs a bit of extra checking work that's always required and is normally + * somewhat verbose. */ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, char** buf, @@ -526,6 +449,13 @@ typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, void const* buf, size_t size); +/* + * Blocking equivalent to walprop_async_write_fn + * + * Returns 'true' if successful, 'false' on failure. + */ +typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); + /* All libpqwalproposer exported functions collected together. 
*/ typedef struct WalProposerFunctionsType { @@ -535,13 +465,12 @@ typedef struct WalProposerFunctionsType walprop_connect_poll_fn walprop_connect_poll; walprop_send_query_fn walprop_send_query; walprop_get_query_result_fn walprop_get_query_result; - walprop_set_nonblocking_fn walprop_set_nonblocking; walprop_socket_fn walprop_socket; walprop_flush_fn walprop_flush; - walprop_consume_input_fn walprop_consume_input; walprop_finish_fn walprop_finish; walprop_async_read_fn walprop_async_read; walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; } WalProposerFunctionsType; /* Allow the above functions to be "called" with normal syntax */ @@ -561,16 +490,16 @@ typedef struct WalProposerFunctionsType WalProposerFunctions->walprop_set_nonblocking(conn, arg) #define walprop_socket(conn) \ WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn) \ - WalProposerFunctions->walprop_flush(conn) -#define walprop_consume_input(conn) \ - WalProposerFunctions->walprop_consume_input(conn) +#define walprop_flush(conn, consume_input) \ + WalProposerFunctions->walprop_flush(conn, consume_input) #define walprop_finish(conn) \ WalProposerFunctions->walprop_finish(conn) #define walprop_async_read(conn, buf, amount) \ WalProposerFunctions->walprop_async_read(conn, buf, amount) #define walprop_async_write(conn, buf, size) \ WalProposerFunctions->walprop_async_write(conn, buf, size) +#define walprop_blocking_write(conn, buf, size) \ + WalProposerFunctions->walprop_blocking_write(conn, buf, size) /* * The runtime location of the libpqwalproposer functions. From 0737310771b36bdd5de0997b1532956753ecba5a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 31 Aug 2021 22:15:21 +0300 Subject: [PATCH 041/214] pass tenant id in connection string to safekeeper --- src/backend/replication/walproposer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1bbe5f30b3a..d612e33331d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -196,8 +196,8 @@ ResetConnection(int i) if (wk->conninfo[0] == '\0') { sprintf((char*) &wk->conninfo, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - wk->host, wk->port, zenith_timeline_walproposer); + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); } wk->conn = walprop_connect_start((char*) &wk->conninfo); @@ -658,8 +658,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { From 98dc4c0523e4cbe7c3fabcccb5e9c1541ed7fb1a Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 31 Aug 2021 18:13:11 +0300 Subject: [PATCH 042/214] Ask pageserver only with LSN's aligned on record boundary. Now pageserver tracks only last_record_lsn and ignores last_valids_lsn. 
We can cause deadlock at start or extreme slowness during the normal work if we call get_page with LSN of incomplete record. Patch by @knizhnik --- contrib/zenith/pagestore_smgr.c | 24 ++++++------------------ src/backend/access/transam/xlog.c | 1 + 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 5db79710d68..569d1c330d8 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -417,7 +417,6 @@ static XLogRecPtr zenith_get_request_lsn(bool nonrel) { XLogRecPtr lsn; - XLogRecPtr flushlsn; if (RecoveryInProgress()) { @@ -434,12 +433,12 @@ zenith_get_request_lsn(bool nonrel) } else if (nonrel) { - lsn = GetFlushRecPtr(); - elog(DEBUG1, "zenith_get_request_lsn norel GetFlushRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); + lsn = GetLastImportantRecPtr(); + elog(DEBUG1, "zenith_get_request_lsn norel GetLastImportantRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); } else { - flushlsn = GetFlushRecPtr(); + XLogRecPtr flushlsn; /* * Use the latest LSN that was evicted from the buffer cache. Any @@ -447,29 +446,18 @@ zenith_get_request_lsn(bool nonrel) * so our request cannot concern those. */ lsn = GetLastWrittenPageLSN(); + Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - if (lsn == InvalidXLogRecPtr) - { - /* - * We haven't evicted anything yet since the server was - * started. Then just use the latest flushed LSN. That's always - * safe, using the latest evicted LSN is really just an - * optimization. - */ - lsn = flushlsn; - elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - } - else - lsn = zm_adjust_lsn(lsn); + lsn = zm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, * we shouldn't evict a page from the buffer cache before all its modifications have * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. */ + flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) { elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 651e65b125a..768dbefe32f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7981,6 +7981,7 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; + XLogCtl->lastWrittenPageLSN = EndOfLog; LocalSetXLogInsertAllowed(); From 671838238a2642c4a08a4ce8796681b82ae3975d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 1 Sep 2021 23:17:17 +0300 Subject: [PATCH 043/214] [refer #506] Correctly initialize all fields of WAL page header for first WAL record of started compute node --- src/backend/access/transam/xlog.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 768dbefe32f..2eee4a9dbdc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7070,6 +7070,10 @@ StartupXLOG(void) EndRecPtr = RecPtr = checkPoint.redo; skipLastRecordReread = true; close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. 
Setting prevRecPtr to %X/%X", + LSN_FORMAT_ARGS(prevRecPtr)); } else { @@ -7749,11 +7753,15 @@ StartupXLOG(void) */ if (skipLastRecordReread) { - XLogRecPtr lastPage = EndRecPtr - (EndRecPtr % XLOG_BLCKSZ); + int offs = (EndRecPtr % XLOG_BLCKSZ); + XLogRecPtr lastPage = EndRecPtr - offs; int idx = XLogRecPtrToBufIdx(lastPage); XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); xlogPageHdr->xlp_pageaddr = lastPage; xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = ThisTimeLineID; + xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; + xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; readOff = XLogSegmentOffset(lastPage, wal_segment_size); elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); } From bee24fe894e20c95ad0a0bed2fabde6f361f72dc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Aug 2021 19:55:38 +0300 Subject: [PATCH 044/214] Add --sync-safekeepers starting standalone walproposer to sync safekeepers (#439). It is intended to solve the following problems: a) Chicken-or-the-egg one: compute postgres needs data directory with non-rel files that are downloaded from pageserver by calling basebackup@LSN. This LSN is not arbitrary, it must include all previously committed transactions and defined through consensus voting, which happens... in walproposer, a part of compute node. b) Just warranting such LSN is not enough, we must also actually commit it and make sure there is a safekeeper who knows this LSN is committed so WAL before it can be streamed to pageserver -- otherwise basebackup will hang waiting for WAL. Advancing commit_lsn without playing consensus game is impossible, so speculative 'let's just poll safekeepers, learn start LSN of future epoch and run basebackup' won't work. Currently --sync-safekeepers is considered completed when 1) at least majority of safekeepers and 2) *all* safekeepers with live connection to walproposer switch to new epoch and advance commit_lsn allowing basebackup to proceed. 2) limits availablity, but that's because currently we don't have a mechanism defining which safekeeper should stream WAL into pageserver. --- src/backend/main/main.c | 3 + src/backend/replication/walproposer.c | 302 +++++++++++++++++++++----- src/include/replication/walproposer.h | 9 +- 3 files changed, 254 insertions(+), 60 deletions(-) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index aa74591e222..8ca0b7f57fa 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -35,6 +35,7 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" +#include "replication/walproposer.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" @@ -209,6 +210,8 @@ main(int argc, char *argv[]) WalRedoMain(argc, argv, NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) + WalProposerSync(argc, argv); else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d612e33331d..06b227244f9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -2,7 +2,36 @@ * * walproposer.c * - * Broadcast WAL stream to Zenith WAL acceptetors + * Proposer/leader part of the total order broadcast protocol between postgres + * and WAL safekeepers. 
+ * + * We have two ways of launching WalProposer: + * + * 1. As a background worker which will run physical WalSender with + * am_wal_proposer flag set to true. WalSender in turn would handle WAL + * reading part and call WalProposer when ready to scatter WAL. + * + * 2. As a standalone utility by running `postgres --sync-safekeepers`. That + * is needed to create LSN from which it is safe to start postgres. More + * specifically it addresses following problems: + * + * a) Chicken-or-the-egg problem: compute postgres needs data directory + * with non-rel files that are downloaded from pageserver by calling + * basebackup@LSN. This LSN is not arbitrary, it must include all + * previously committed transactions and defined through consensus + * voting, which happens... in walproposer, a part of compute node. + * + * b) Just warranting such LSN is not enough, we must also actually commit + * it and make sure there is a safekeeper who knows this LSN is + * committed so WAL before it can be streamed to pageserver -- otherwise + * basebackup will hang waiting for WAL. Advancing commit_lsn without + * playing consensus game is impossible, so speculative 'let's just poll + * safekeepers, learn start LSN of future epoch and run basebackup' + * won't work. + * + * TODO: check that LSN on safekeepers after start is the same as it was + * after `postgres --sync-safekeepers`. + *------------------------------------------------------------------------- */ #include "postgres.h" @@ -18,6 +47,7 @@ #include "replication/walreceiver.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" #include "storage/pmsignal.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" @@ -29,7 +59,6 @@ char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; - /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType* WalProposerFunctions = NULL; @@ -45,7 +74,11 @@ static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers * static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; static AppendResponse lastFeedback; -static XLogRecPtr truncateLsn; /* Last position received by all walkeepers. */ +/* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last chunk streamed to everyone) + */ +static XLogRecPtr truncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -55,6 +88,9 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; +/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ +static bool syncSafekeepers; + /* Declarations of a few functions ahead of time, so that we can define them out of order. 
*/ static void AdvancePollState(int i, uint32 events); static bool AsyncRead(int i, void* value, size_t value_size); @@ -62,6 +98,9 @@ static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState succ static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); +static WalMessage* CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void BroadcastMessage(WalMessage* msg); + /* * Combine hot standby feedbacks from all walkeepers. @@ -277,23 +316,27 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + int i; + int n_synced; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) { lastFeedback.flushLsn = minQuorumLsn; /* advance the replication slot */ - ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + if (!syncSafekeepers) + ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) { lastFeedback.hs = hsFeedback; - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } @@ -312,26 +355,52 @@ HandleWalKeeperResponse(void) } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. It would mean whichever majority is alive, + * there will be at least one safekeeper who is able to stream WAL to + * pageserver to make basebackup possible. However, since at the moment we + * don't have any good mechanism of defining the healthy and most advanced + * safekeeper who should push the wal into pageserver and basically the + * random one gets connected, to prevent hanging basebackup (due to + * pageserver connecting to not-synced-walkeeper) we currently wait for all + * seemingly alive walkeepers to get synced. + */ + if (syncSafekeepers) + { + for (int i = 0; i < n_walkeepers; i++) + { + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (wk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All walkeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } } char *zenith_timeline_walproposer = NULL; char *zenith_tenant_walproposer = NULL; -/* - * WAL proposer bgworeker entry point - */ -void -WalProposerMain(Datum main_arg) + +static void +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { char* host; char* sep; char* port; - /* Establish signal handlers. 
*/ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - /* Load the libpq-specific functions */ load_file("libpqwalproposer", false); if (WalProposerFunctions == NULL) @@ -340,11 +409,8 @@ WalProposerMain(Datum main_arg) load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); - BackgroundWorkerUnblockSignals(); - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); @@ -374,14 +440,12 @@ WalProposerMain(Datum main_arg) } quorum = n_walkeepers/2 + 1; - GetXLogReplayRecPtr(&ThisTimeLineID); - /* Fill the greeting package */ proposerGreeting.tag = 'g'; proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; proposerGreeting.pgVersion = PG_VERSION_NUM; pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = GetSystemIdentifier(); + proposerGreeting.systemId = systemId; if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && @@ -395,13 +459,52 @@ WalProposerMain(Datum main_arg) proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; + InitEventSet(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(i); + } + + WalProposerLoop(); +} + +/* + * WAL proposer bgworeker entry point + */ +void +WalProposerMain(Datum main_arg) +{ + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + GetXLogReplayRecPtr(&ThisTimeLineID); + + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); + last_reconnect_attempt = GetCurrentTimestamp(); application_name = (char *) "walproposer"; /* for synchronous_standby_names */ am_wal_proposer = true; am_walsender = true; InitWalSender(); - InitEventSet(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) @@ -410,14 +513,54 @@ WalProposerMain(Datum main_arg) ReplicationSlotRelease(); } - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) - { - ResetConnection(i); - } + WalProposerStart(); +} - while (true) - WalProposerPoll(); +void +WalProposerSync(int argc, char *argv[]) +{ + syncSafekeepers = true; + + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. 
+ * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); } static void @@ -458,8 +601,22 @@ SendMessageToNode(int i, WalMessage* msg) /* Only try to send the message if it's non-null */ if (wk->currMsg) { - wk->currMsg->req.truncateLsn = truncateLsn; wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + /* + * truncateLsn is advanced immediately once chunk is broadcast to all + * safekeepers, and commitLsn generally can't be advanced based on + * feedback from safekeeper who is still in the previous epoch (similar + * to 'leader can't commit entries from previous term' in Raft), so the + * first might surprisingly get higher than the latter. + * + * Another reason for this will be switch to proper acks from + * safekeepers: they must point to end of last valid record, not just + * end of last received chunk. + * + * Free safekeeper from such surprises by holding back truncateLsn in + * these cases. + */ + wk->currMsg->req.truncateLsn = Min(truncateLsn, wk->currMsg->req.commitLsn); /* Once we've selected and set up our message, actually start sending it. */ wk->state = SS_SEND_WAL; @@ -539,17 +696,11 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) * know that commit lsn has advanced. */ static WalMessage* -CreateMessageCommitLsnOnly(void) +CreateMessageCommitLsnOnly(XLogRecPtr lsn) { /* Create new message and append it to message queue */ WalMessage* msg; - if (lastSentLsn == 0) - { - /* FIXME: We haven't sent anything yet. Not sure what to do then.. */ - return NULL; - } - msg = (WalMessage*)malloc(sizeof(WalMessage)); if (msgQueueTail != NULL) msgQueueTail->next = msg; @@ -563,8 +714,18 @@ CreateMessageCommitLsnOnly(void) msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = lastSentLsn; - msg->req.endLsn = lastSentLsn; + /* + * This serves two purposes: + * 1) After all msgs from previous epochs are pushed we queue empty + * WalMessage with lsn set to epochStartLsn which commands to switch the + * epoch, which allows to do the switch without creating new epoch + * records (we especially want to avoid such in --sync mode). + * Walproposer can advance commit_lsn only after the switch, so this lsn + * (reported back) also is the first possible advancement point. + * 2) Maintain common invariant of queue entries sorted by LSN. 
+ */ + msg->req.beginLsn = lsn; + msg->req.endLsn = lsn; msg->req.proposerId = proposerGreeting.proposerId; /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; @@ -602,8 +763,9 @@ DetermineEpochStartLsn(void) } } - elog(LOG, "got votes from majority (%d) of nodes, epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", quorum, + propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), walkeeper[donor].host, walkeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) @@ -682,20 +844,34 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec if (walrcv_startstreaming(wrconn, &options)) { XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn; + XLogRecPtr rec_end_lsn = 0; int len; char *buf; pgsocket wait_fd = PGINVALID_SOCKET; - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) > 0) + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) { - Assert(buf[0] == 'w'); - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void)CreateMessage(rec_start_lsn, buf, len); - if (rec_end_lsn >= endpos) - break; + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w'); + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void) CreateMessage(rec_start_lsn, buf, len); + elog(DEBUG1, "Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len); + if (rec_end_lsn >= endpos) + break; + } } + elog(DEBUG1, "end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)); walrcv_disconnect(wrconn); } else @@ -1032,6 +1208,20 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); + /* this message signifies epoch switch */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + if (syncSafekeepers) + { + /* Wait until all walkeepers are synced */ + WalProposerLoop(); + } + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); } WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ @@ -1081,7 +1271,6 @@ AdvancePollState(int i, uint32 events) { WalMessage* next; XLogRecPtr minQuorumLsn; - WalMessage* commitLsnUpdateMsg; /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. 
*/ @@ -1089,7 +1278,6 @@ AdvancePollState(int i, uint32 events) return; next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ wk->currMsg = NULL; @@ -1106,9 +1294,7 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { - commitLsnUpdateMsg = CreateMessageCommitLsnOnly(); - if (commitLsnUpdateMsg) - BroadcastMessage(commitLsnUpdateMsg); + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); lastSentCommitLsn = minQuorumLsn; } break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 6741e9f82dd..99e62142736 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -244,7 +244,7 @@ typedef struct AppendRequestHeader XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last record streamed to everyone) + * + 1 of last chunk streamed to everyone) */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ @@ -289,8 +289,11 @@ typedef struct AppendResponse */ uint64 tag; term_t term; - term_t epoch; + term_t epoch; XLogRecPtr flushLsn; + // Safekeeper reports back his awareness about which WAL is committed, as + // this is a criterion for walproposer --sync mode exit + XLogRecPtr commitLsn; HotStandbyFeedback hs; } AppendResponse; @@ -344,6 +347,8 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); void StartReplication(StartReplicationCmd *cmd); +void WalProposerSync(int argc, char *argv[]); + /* libpqwalproposer hooks & helper type */ From 31789272541067852a63c61cf2353114ae59dc12 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 27 Aug 2021 16:02:14 +0300 Subject: [PATCH 045/214] Update Dockerfile --- .circleci/config.yml | 6 ++---- Dockerfile | 27 ++++++++++----------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ad48e5ac396..16a271b0386 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,10 +6,8 @@ jobs: docker_image: docker: - image: cimg/base:2021.04 - working_directory: ~/repo steps: - - checkout: - path: ~/repo + - checkout - setup_remote_docker: docker_layer_caching: true - run: @@ -25,7 +23,7 @@ workflows: # Build and push image only for commits to `main`. 
- docker_image: # Context gives an ability to login - context: 'Docker Hub' + context: Docker Hub filters: branches: only: diff --git a/Dockerfile b/Dockerfile index 83407413142..4878e3cc755 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM zenithdb/compute-tools:latest AS compute-deps -# Only to get ready apply_conf binary as a dep +# Only to get ready zenith_ctl and apply_conf binaries as deps # # Image with Postgres build deps @@ -43,32 +43,25 @@ WORKDIR /pg FROM debian:buster-slim # libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install openssh-server libreadline-dev && \ - # This will prepare everything needed by sshd - # like generation host keys with ssh-keygen -A - service ssh start +RUN apt-get update && apt-get -yq install libreadline-dev # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres/compute && \ - chown -R postgres:postgres /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ chmod 0750 /var/db/postgres/compute # Copy ready Postgres binaries -COPY --from=pg-build /pg/compute_build/postgres_bin /var/db/postgres/install +COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local -# Copy apply_conf binary +# Copy binaries from compute-tools COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf +COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl -# Copy postgres binaries to the common location -RUN cp /var/db/postgres/install/bin/* /usr/local/bin/ && \ - cp -r /var/db/postgres/install/share/* /usr/local/share/ && \ - # Add postgres shared objects to the search path - echo '/var/db/postgres/install/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# Add postgres shared objects to the search path +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# To be able to run sshd (seems to be default) -# USER root +USER postgres -ENTRYPOINT ["/bin/sh"] +ENTRYPOINT ["/usr/local/bin/zenith_ctl"] From b6fd51a41be3ffeb5896b9d06c7cf07d0828f336 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 10 Sep 2021 13:22:08 +0300 Subject: [PATCH 046/214] Fix compiler warnings in walproposer.c --- src/backend/replication/walproposer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 06b227244f9..bee12ad21b7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -316,8 +316,7 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; - int i; - int n_synced; + int n_synced = 0; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) From 6861fa67ba5765562700ebefd0fee8991293a9ae Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 9 Sep 2021 17:30:57 +0300 Subject: [PATCH 047/214] Always advance truncateLsn to commitLsn, keeping it on record boundary. And take initial value from freshly created slot position. Thus proposer always starts streaming from the record beginning; it simplifies WAL decoding on safekeeper. 
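For context, a minimal standalone sketch of the rule this change enforces (the typedef and the helper name are made up for illustration; the real logic uses candidateTruncateLsn in HandleWalKeeperResponse() in the diff below): truncateLsn only ever advances to a position the quorum has already committed, so it always sits on a record boundary and never overtakes commitLsn.

#include <stdint.h>

typedef uint64_t XLogRecPtr;    /* stand-in for the PostgreSQL typedef */

/*
 * acked_end       -- end of the last chunk confirmed by *all* safekeepers
 * commit_boundary -- a quorum commit LSN remembered earlier (record boundary)
 *
 * The truncate position only ever lands on such commit boundaries, so it
 * never points into the middle of a WAL record.
 */
static XLogRecPtr
advance_truncate_lsn(XLogRecPtr current_truncate,
                     XLogRecPtr acked_end,
                     XLogRecPtr commit_boundary)
{
    if (commit_boundary != 0 && acked_end >= commit_boundary)
        return commit_boundary;   /* everyone holds WAL up to a record boundary */
    return current_truncate;      /* hold back rather than truncate mid-record */
}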
--- src/backend/replication/walproposer.c | 88 ++++++++++++++++++++------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index bee12ad21b7..0532fb47025 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -79,6 +79,7 @@ static AppendResponse lastFeedback; * + 1 of last chunk streamed to everyone) */ static XLogRecPtr truncateLsn; +static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -344,10 +345,30 @@ HandleWalKeeperResponse(void) { WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; - if (truncateLsn < msg->req.beginLsn) + /* + * This piece is received by everyone; try to advance truncateLsn, but + * hold it back to nearest commitLsn. Thus we will always start + * streaming from the beginning of the record, which simplifies decoding + * on the far end. + * + * This also prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and commitLsn + * generally can't be advanced based on feedback from safekeeper who is + * still in the previous epoch (similar to 'leader can't commit entries + * from previous term' in Raft); 2) chunks we read from WAL and send are + * plain sheets of bytes, but safekeepers ack only on commit boundaries. + */ + if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) + { + truncateLsn = minQuorumLsn; + candidateTruncateLsn = InvalidXLogRecPtr; + } + else if (msg->req.endLsn >= candidateTruncateLsn && + candidateTruncateLsn != InvalidXLogRecPtr) { - Assert(truncateLsn < msg->req.endLsn); - truncateLsn = msg->req.endLsn; + truncateLsn = candidateTruncateLsn; + candidateTruncateLsn = InvalidXLogRecPtr; } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); @@ -509,6 +530,10 @@ WalProposerMain(Datum main_arg) if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); ReplicationSlotRelease(); } @@ -601,21 +626,7 @@ SendMessageToNode(int i, WalMessage* msg) if (wk->currMsg) { wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - /* - * truncateLsn is advanced immediately once chunk is broadcast to all - * safekeepers, and commitLsn generally can't be advanced based on - * feedback from safekeeper who is still in the previous epoch (similar - * to 'leader can't commit entries from previous term' in Raft), so the - * first might surprisingly get higher than the latter. - * - * Another reason for this will be switch to proper acks from - * safekeepers: they must point to end of last valid record, not just - * end of last received chunk. - * - * Free safekeeper from such surprises by holding back truncateLsn in - * these cases. - */ - wk->currMsg->req.truncateLsn = Min(truncateLsn, wk->currMsg->req.commitLsn); + wk->currMsg->req.truncateLsn = truncateLsn; /* Once we've selected and set up our message, actually start sending it. 
*/ wk->state = SS_SEND_WAL; @@ -741,10 +752,9 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) static void DetermineEpochStartLsn(void) { - // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - propEpochStartLsn = wal_segment_size; + propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; - truncateLsn = wal_segment_size; + truncateLsn = InvalidXLogRecPtr; for (int i = 0; i < n_walkeepers; i++) { @@ -762,7 +772,28 @@ DetermineEpochStartLsn(void) } } - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. To keep the idea of always starting streaming since record + * boundary (which simplifies decoding on safekeeper), take start position + * of the slot. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, SAB_Error); + propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + ReplicationSlotRelease(); + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to some + * connected safekeeper; it must have carried truncateLsn pointing to the + * first record. + */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", quorum, propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), @@ -1240,8 +1271,9 @@ AdvancePollState(int i, uint32 events) { WalMessage* msg = wk->currMsg; - elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X restart LSN=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(msg->req.beginLsn), LSN_FORMAT_ARGS(msg->req.commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); @@ -1294,6 +1326,16 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + /* + * commitLsn is always the record boundary; remember it so + * we can advance truncateLsn there. But do so only if + * previous value is applied, otherwise it might never catch + * up. + */ + if (candidateTruncateLsn == InvalidXLogRecPtr) + { + candidateTruncateLsn = minQuorumLsn; + } lastSentCommitLsn = minQuorumLsn; } break; From 559c2a13d562fe34bbcfa0a930581ead6f048516 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Sep 2021 21:21:21 +0300 Subject: [PATCH 048/214] Minor logging editing. 
--- src/backend/replication/walproposer.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 0532fb47025..02051e627f7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -861,10 +861,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec err))); return false; } - elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", - walkeeper[donor].host, walkeeper[donor].port, - (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, - timeline); + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1017,7 +1018,8 @@ AdvancePollState(int i, uint32 events) switch (result) { case WP_CONN_POLLING_OK: - elog(LOG, "Connected with node %s:%s", wk->host, wk->port); + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); /* Once we're fully connected, we can move to the next state */ wk->state = SS_EXEC_STARTWALPUSH; @@ -1233,8 +1235,11 @@ AdvancePollState(int i, uint32 events) /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ if (truncateLsn < propEpochStartLsn) { - elog(LOG, "start recovery because restart LSN=%X/%X is not equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); @@ -1271,12 +1276,13 @@ AdvancePollState(int i, uint32 events) { WalMessage* msg = wk->currMsg; - elog(LOG, "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, + "sending message with len %ld beginLsn=%X/%X " + "commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.beginLsn), LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(truncateLsn), - wk->host, wk->port); + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); /* We write with msg->size here because the body of the message * is stored after the end of the WalMessage struct, in the From 3414506567f4878a05e3779d41244d338495ed82 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Sep 2021 21:21:45 +0300 Subject: [PATCH 049/214] Fix walproposer starting streaming point. Send *all* entries (from the beginning, i.e. truncateLsn) to everyone but donor who doesn't need recovery at all and will receive only new entries. This can be optimized to avoid sending data which is already persisted (and correct), but previous such optimization was incorrect. 
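A small illustrative sketch of the decision this patch adopts (the function and parameter names are invented for the example; the actual loop is in WalProposerRecovery() in the diff below). A safekeeper's flushLsn only says how much WAL it has written, not whether those bytes match the donor's history, so it cannot by itself justify skipping recovery messages.

#include <stdbool.h>

/*
 * After election, only the donor is spared a resend; every other idle
 * safekeeper is streamed the whole queue again, starting at truncateLsn,
 * because WAL it flushed past the committed point may have come from a
 * losing proposer and differ from the donor's copy.
 */
static bool
needs_resend_from_truncate_lsn(int sk_idx, int donor_idx)
{
    return sk_idx != donor_idx;
}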
--- src/backend/replication/walproposer.c | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 02051e627f7..f8aed25255b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -912,23 +912,23 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec timeline, (uint32)(startpos >> 32), (uint32)startpos))); return false; } - /* Setup restart point for all walkeepers */ + + /* + * Start sending entries to everyone from the beginning (truncateLsn), + * except for donor who doesn't need recovery at all. We could do here + * better, taking into account commitLsn of safekeepers to avoid sending + * them excessive data, but this requires some effort (note also that we + * must always start sending from the beginning of the record). + * + * And note that we definitely can't pick up flushLsn of safekeeper and + * decide he already has everything before, as such WAL is generally + * entirely different than the correct (donor) one. + */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE) + if (walkeeper[i].state == SS_IDLE && i != donor) { - for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) - { - msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ - } - else - { - SendMessageToNode(i, msg); - break; - } - } + SendMessageToNode(i, msgQueueHead); } } return true; From dea852c1299aa12e56bb8dc3bdee3954b865badf Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 14 Sep 2021 16:41:35 +0300 Subject: [PATCH 050/214] Mark all recovery messages as received by the donor. I forgot to do that in 42316a81d3. Fixes segfault related to attempt to send the (garbage collected) message second time and queue advancement when donor doesn't restart. --- src/backend/replication/walproposer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index f8aed25255b..5deb83579d6 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -613,9 +613,8 @@ SendMessageToNode(int i, WalMessage* msg) /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); /* - * Skip already acknowledged messages. Used during start to get to the - * first not yet received message. Otherwise we always just send - * 'msg'. + * Skip already acknowledged messages. Used after reconnection to get to the + * first not yet sent message. Otherwise we always just send 'msg'. */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; @@ -931,6 +930,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec SendMessageToNode(i, msgQueueHead); } } + /* Mark all recovery messages as already received by the donor. 
*/ + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + msg->ackMask |= 1 << donor; + } return true; } @@ -1243,7 +1247,12 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); - /* this message signifies epoch switch */ + /* + * This message signifies epoch switch; it is needed to + * make the switch happen on donor, as he won't get any + * other messages until we start writing new WAL (and we + * e.g. don't in --sync mode at all) + */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); if (syncSafekeepers) From ff595e33034ef0cbad95fa877bccbd8aa068cf7e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 14 Sep 2021 17:41:49 +0300 Subject: [PATCH 051/214] Don't FATAL in walproposer when EOF arrives in SS_IDLE state. --- src/backend/replication/walproposer.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5deb83579d6..2ae1e4d86aa 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -224,8 +224,6 @@ ResetConnection(int i) if (wk->state != SS_OFFLINE) { - elog(WARNING, "Connection with node %s:%s in %s state failed", - wk->host, wk->port, FormatWalKeeperState(wk->state)); ShutdownConnection(i); } @@ -1187,9 +1185,10 @@ AdvancePollState(int i, uint32 events) * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to * SS_SEND_VOTE. */ case SS_VOTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", - wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(i); + break; /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: @@ -1276,8 +1275,10 @@ AdvancePollState(int i, uint32 events) /* Idle state for sending WAL. Moved out only by calls to * SendMessageToNode */ case SS_IDLE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(i); + break; /* Start to send the message at wk->currMsg. Triggered only by calls * to SendMessageToNode */ From f82b6eb75a47bce41e3e20476878dab2d9f46322 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Sep 2021 15:13:14 +0300 Subject: [PATCH 052/214] Optimize walproposer starting streaming point. Safekeepers who are in the same epoch as donor definitely have correct WAL, so we can send to them since their flushLsn. This required some additionall fuss due to convention of always starting streaming at the record boundary. 
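A rough standalone sketch of the per-safekeeper start position this patch introduces (typedefs and the helper name are illustrative only; the real logic lives in WalProposerRecovery() in the diff below): a safekeeper that reports the donor's epoch is known to hold correct WAL, so streaming can resume at its own flush position, while everyone else is resent everything from truncateLsn.

#include <stdint.h>

typedef uint64_t XLogRecPtr;    /* stand-ins for the PostgreSQL typedefs */
typedef uint64_t term_t;

/*
 * Pick where to start streaming to one safekeeper after election.  Same
 * epoch as the donor => its WAL is correct, resume at its flushLsn (the
 * queued message is then trimmed so it begins exactly there, preserving the
 * record-boundary convention); otherwise resend from truncateLsn.
 */
static XLogRecPtr
recovery_start_lsn(term_t sk_epoch, term_t donor_epoch,
                   XLogRecPtr sk_flush_lsn, XLogRecPtr truncate_lsn)
{
    return (sk_epoch == donor_epoch) ? sk_flush_lsn : truncate_lsn;
}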
--- src/backend/replication/walproposer.c | 94 ++++++++++++++++++++------- src/include/replication/walproposer.h | 6 ++ 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 2ae1e4d86aa..d5e9730c526 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -368,6 +368,11 @@ HandleWalKeeperResponse(void) truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } + for (int i = 0; i < n_walkeepers; i++) + { + if (msg->perSafekeeper[i]) + free(msg->perSafekeeper[i]); + } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } @@ -677,6 +682,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; + memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -718,6 +724,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; + memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -912,26 +919,54 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec /* * Start sending entries to everyone from the beginning (truncateLsn), - * except for donor who doesn't need recovery at all. We could do here - * better, taking into account commitLsn of safekeepers to avoid sending - * them excessive data, but this requires some effort (note also that we - * must always start sending from the beginning of the record). - * - * And note that we definitely can't pick up flushLsn of safekeeper and - * decide he already has everything before, as such WAL is generally - * entirely different than the correct (donor) one. + * except for those who lives in donor's epoch and thus for sure has correct + * WAL. We could do here even slightly better, taking into account commitLsn + * of the rest to avoid sending them excessive data. */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE && i != donor) + if (walkeeper[i].state != SS_IDLE) + continue; + + if (walkeeper[i].voteResponse.epoch != donorEpoch) { SendMessageToNode(i, msgQueueHead); } - } - /* Mark all recovery messages as already received by the donor. */ - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - msg->ackMask |= 1 << donor; + else + { + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) + { + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << i; + } + else + { + uint32 len; + uint32 size; + + /* + * By convention we always stream since the beginning of the + * record, and flushLsn points to it -- form the message + * starting there. 
+ */ + len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; + size = sizeof(AppendRequestHeader) + len; + msg->perSafekeeper[i] = malloc(size); + *msg->perSafekeeper[i] = msg->req; + msg->perSafekeeper[i]->beginLsn = + walkeeper[i].voteResponse.flushLsn; + memcpy(&msg->perSafekeeper[i] + 1, + (char *) (&msg->req + 1) + + walkeeper[i].voteResponse.flushLsn - + msg->req.beginLsn, + len); + SendMessageToNode(i, msg); + break; + } + } + } } return true; } @@ -1206,11 +1241,17 @@ AdvancePollState(int i, uint32 events) if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, wk->voteResponse.epoch, + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + /* - * In case of acceptor rejecting our vote, bail out, but only if - * either it already lives in strictly higher term (concurrent - * compute spotted) or we are not elected yet and thus need the - * vote. + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. */ if ((!wk->voteResponse.voteGiven) && (wk->voteResponse.term > propTerm || n_votes < quorum)) @@ -1285,19 +1326,26 @@ AdvancePollState(int i, uint32 events) case SS_SEND_WAL: { WalMessage* msg = wk->currMsg; + AppendRequestHeader *req = &msg->req; + + /* if there is a message specially crafted for this safekeeper, send it */ + if (msg->perSafekeeper[i]) + req = msg->perSafekeeper[i]; elog(LOG, - "sending message with len %ld beginLsn=%X/%X " - "commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(msg->req.beginLsn), - LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); /* We write with msg->size here because the body of the message * is stored after the end of the WalMessage struct, in the * allocation for each msg */ - if (!AsyncWrite(i, &msg->req, msg->size, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + if (!AsyncWrite(i, req, + sizeof(AppendRequestHeader) + req->endLsn - + req->beginLsn, + SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) return; break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 99e62142736..c455d0564e9 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -259,6 +259,12 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ + /* + * By convention safekeeper starts receiving data since record boundary, we + * may need to send first message not from the chunk beginning for that; + * such trimmed message is formed here. 
+ */ + AppendRequestHeader *perSafekeeper[MAX_WALKEEPERS]; AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: From 202fb861fb59ef51050191773994c63b20d55fe6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:27 +0300 Subject: [PATCH 053/214] Silence compiler warnings: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit contrib/zenith/libpagestore.c: In function ‘zenith_connect’: contrib/zenith/libpagestore.c:125:2: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 125 | const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); | ^~~~~ src/backend/tcop/zenith_wal_redo.c:294:2: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 294 | bool enable_seccomp = true; | ^~~~ In the passing, also move the 'n_synced' local variable closer to where it's used. --- contrib/zenith/libpagestore.c | 30 ++++++++++++++------------- src/backend/replication/walproposer.c | 4 +++- src/backend/tcop/zenith_wal_redo.c | 5 ++++- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index b726cee80f8..fdcdf3b8990 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -117,22 +117,24 @@ zenith_connect() } // copy values from PQconninfoOption to key/value arrays because PQconnectdbParams accepts options this way - const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); - const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - - for (i = 0; i < noptions; i++) { - keywords[i] = conn_options[i].keyword; - values[i] = conn_options[i].val; - } - // add array terminator - keywords[i] = NULL; - values[i] = NULL; + const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); + const char **values = malloc((noptions + 1) * sizeof(*values)); + int i = 0; + + for (i = 0; i < noptions; i++) + { + keywords[i] = conn_options[i].keyword; + values[i] = conn_options[i].val; + } + // add array terminator + keywords[i] = NULL; + values[i] = NULL; - pageserver_conn = PQconnectdbParams(keywords, values, false); - free(keywords); - free(values); + pageserver_conn = PQconnectdbParams(keywords, values, false); + free(keywords); + free(values); + } PQconninfoFree(conn_options); diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d5e9730c526..5eab36461f8 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -315,7 +315,6 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; - int n_synced = 0; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -393,6 +392,9 @@ HandleWalKeeperResponse(void) */ if (syncSafekeepers) { + int n_synced; + + n_synced = 0; for (int i = 0; i < n_walkeepers; i++) { WalKeeper *wk = &walkeeper[i]; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 7e00a9e985d..15db900cc8a 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -156,6 +156,9 @@ WalRedoMain(int argc, char *argv[], { int firstchar; StringInfoData input_message; +#ifdef HAVE_LIBSECCOMP + bool enable_seccomp; +#endif /* Initialize startup process environment if necessary. 
*/ InitStandaloneProcess(argv[0]); @@ -291,7 +294,7 @@ WalRedoMain(int argc, char *argv[], #ifdef HAVE_LIBSECCOMP /* We prefer opt-out to opt-in for greater security */ - bool enable_seccomp = true; + enable_seccomp = true; for (int i = 1; i < argc; i++) if (strcmp(argv[i], "--disable-seccomp") == 0) enable_seccomp = false; From 7c55bf3bae4e9c3e75da57f47c58ba515b533096 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:30 +0300 Subject: [PATCH 054/214] Remove unused functions for reading non-rel pages. These could be used to fetch SLRUs and other non-relation things from the page server. But we don't do that, and have no plans in the near future. --- contrib/zenith/pagestore_client.h | 3 -- contrib/zenith/pagestore_smgr.c | 73 ++----------------------------- 2 files changed, 4 insertions(+), 72 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index dbcaa5fdb91..d94cbcb5185 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -123,9 +123,6 @@ extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); -extern bool zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum); -extern void zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum); - /* zenith wal-redo storage manager functionality */ extern void inmem_init(void); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 569d1c330d8..be8277b2fee 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -414,7 +414,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool nonrel) +zenith_get_request_lsn(void) { XLogRecPtr lsn; @@ -431,11 +431,6 @@ zenith_get_request_lsn(bool nonrel) lsn = InvalidXLogRecPtr; elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); } - else if (nonrel) - { - lsn = GetLastImportantRecPtr(); - elog(DEBUG1, "zenith_get_request_lsn norel GetLastImportantRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); - } else { XLogRecPtr flushlsn; @@ -485,7 +480,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .rnode = reln->smgr_rnode.node, .forknum = forkNum }, - .lsn = zenith_get_request_lsn(false) + .lsn = zenith_get_request_lsn() }); ok = resp->ok; pfree(resp); @@ -640,7 +635,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, ZenithResponse *resp; XLogRecPtr request_lsn; - request_lsn = zenith_get_request_lsn(false); + request_lsn = zenith_get_request_lsn(); resp = page_server->request((ZenithRequest) { .tag = T_ZenithReadRequest, .page_key = { @@ -765,66 +760,6 @@ hexdump_page(char *page) } #endif - -bool -zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum) -{ - bool ok; - ZenithResponse *resp; - - elog(SmgrTrace, "[ZENITH_SMGR] zenith_nonrel_page_exists relnode %u/%u/%u_%d blkno %u", - rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno); - - resp = page_server->request((ZenithRequest) { - .tag = T_ZenithExistsRequest, - .page_key = { - .rnode = rnode, - .forknum = forknum, - .blkno = blkno - }, - .lsn = zenith_get_request_lsn(true) - }); - ok = resp->ok; - pfree(resp); - return ok; -} - -void -zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum) -{ - int bufsize = BLCKSZ; - ZenithResponse *resp; - 
XLogRecPtr lsn; - - //43 is magic for RELMAPPER_FILENAME in page cache - // relmapper files has non-standard size of 512bytes - if (forknum == 43) - bufsize = 512; - - lsn = zenith_get_request_lsn(true); - - elog(SmgrTrace, "[ZENITH_SMGR] read nonrel relnode %u/%u/%u_%d blkno %u lsn %X/%X", - rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno, - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - resp = page_server->request((ZenithRequest) { - .tag = T_ZenithReadRequest, - .page_key = { - .rnode = rnode, - .forknum = forknum, - .blkno = blkno - }, - .lsn = lsn - }); - - if (!resp->ok) - elog(ERROR, "[ZENITH_SMGR] smgr page not found"); - - memcpy(buffer, resp->page, bufsize); - pfree(resp); -} - - /* * zenith_write() -- Write the supplied block at the appropriate location. * @@ -867,7 +802,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) return n_blocks; - request_lsn = zenith_get_request_lsn(false); + request_lsn = zenith_get_request_lsn(); resp = page_server->request((ZenithRequest) { .tag = T_ZenithNblocksRequest, .page_key = { From bd5979d95154673061005cf217fa57e6c625c5de Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:32 +0300 Subject: [PATCH 055/214] Misc cleanup in the code that communicates with the page server. - Remove unused 'system_id' field from ZenithRequest. - Remove unused 'loaded' variable. - Remove unused to pack pageserver->client messages, and to unpack client->pageserver messages. - Fix printing the response in debug message (was printing the request twice) - Avoid the overhead of converting request/response to string, unless the debug message is really going to be printed - Formatting fixes. --- contrib/zenith/libpagestore.c | 15 ++++---- contrib/zenith/pagestore_client.h | 23 ++++++------ contrib/zenith/pagestore_smgr.c | 61 ++++++++++--------------------- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index fdcdf3b8990..2e33474e7e4 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,7 +42,7 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; -static ZenithResponse * zenith_call(ZenithRequest request); +static ZenithResponse * zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call }; @@ -202,7 +202,7 @@ zenith_connect() static ZenithResponse * -zenith_call(ZenithRequest request) +zenith_call(ZenithRequest *request) { StringInfoData req_buff; StringInfoData resp_buff; @@ -219,7 +219,7 @@ zenith_call(ZenithRequest request) if (!connected) zenith_connect(); - req_buff = zm_pack((ZenithMessage *) & request); + req_buff = zm_pack_request(request); /* send request */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) @@ -229,8 +229,9 @@ zenith_call(ZenithRequest request) } pfree(req_buff.data); + if (message_level_is_interesting(PqPageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) & request); + char *msg = zm_to_string((ZenithMessage *) request); zenith_log(PqPageStoreTrace, "Sent request: %s", msg); pfree(msg); @@ -245,21 +246,21 @@ zenith_call(ZenithRequest request) else if (resp_buff.len == -2) zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - resp = zm_unpack(&resp_buff); + resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); Assert(messageTag(resp) == T_ZenithStatusResponse || messageTag(resp) 
== T_ZenithNblocksResponse || messageTag(resp) == T_ZenithReadResponse); + if (message_level_is_interesting(PqPageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) & request); + char *msg = zm_to_string((ZenithMessage *) resp); zenith_log(PqPageStoreTrace, "Got response: %s", msg); pfree(msg); } - /* * XXX: zm_to_string leak strings. Check with what memory contex all this * methods are called. diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index d94cbcb5185..9600c974f70 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -36,14 +36,14 @@ typedef enum T_ZenithStatusResponse = 100, T_ZenithNblocksResponse, T_ZenithReadResponse, -} ZenithMessageTag; +} ZenithMessageTag; /* base struct for c-style inheritance */ typedef struct { ZenithMessageTag tag; -} ZenithMessage; +} ZenithMessage; #define messageTag(m) (((const ZenithMessage *)(m))->tag) @@ -54,15 +54,14 @@ typedef struct RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} PageKey; +} PageKey; typedef struct { ZenithMessageTag tag; - uint64 system_id; PageKey page_key; XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} ZenithRequest; typedef struct { @@ -70,11 +69,11 @@ typedef struct bool ok; uint32 n_blocks; char page[1]; -} ZenithResponse; +} ZenithResponse; -StringInfoData zm_pack(ZenithMessage * msg); -ZenithMessage *zm_unpack(StringInfo s); -char *zm_to_string(ZenithMessage * msg); +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithMessage *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); /* * API @@ -82,10 +81,10 @@ char *zm_to_string(ZenithMessage * msg); typedef struct { - ZenithResponse *(*request) (ZenithRequest request); -} page_server_api; + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; -extern page_server_api * page_server; +extern page_server_api *page_server; extern char *page_server_connstring; extern char *callmemaybe_connstring; diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index be8277b2fee..62c30808481 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -50,8 +50,6 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -bool loaded = false; - page_server_api *page_server; /* GUCs */ @@ -72,7 +70,7 @@ char const *const ZenithMessageStr[] = }; StringInfoData -zm_pack(ZenithMessage *msg) +zm_pack_request(ZenithRequest *msg) { StringInfoData s; @@ -98,56 +96,25 @@ zm_pack(ZenithMessage *msg) break; } - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_ZenithStatusResponse: case T_ZenithNblocksResponse: - { - ZenithResponse *msg_resp = (ZenithResponse *) msg; - pq_sendbyte(&s, msg_resp->ok); - pq_sendint32(&s, msg_resp->n_blocks); - break; - } case T_ZenithReadResponse: - { - ZenithResponse *msg_resp = (ZenithResponse *) msg; - pq_sendbyte(&s, msg_resp->ok); - pq_sendint32(&s, msg_resp->n_blocks); - pq_sendbytes(&s, msg_resp->page, BLCKSZ); // XXX: should be varlena + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; - } } return s; } ZenithMessage * -zm_unpack(StringInfo s) +zm_unpack_response(StringInfo s) { ZenithMessageTag tag = pq_getmsgbyte(s); ZenithMessage *msg = NULL; switch (tag) { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithReadRequest: - { - ZenithRequest *msg_req = palloc0(sizeof(ZenithRequest)); - - msg_req->tag = tag; - msg_req->system_id = 42; - msg_req->page_key.rnode.spcNode = pq_getmsgint(s, 4); - msg_req->page_key.rnode.dbNode = pq_getmsgint(s, 4); - msg_req->page_key.rnode.relNode = pq_getmsgint(s, 4); - msg_req->page_key.forknum = pq_getmsgbyte(s); - msg_req->page_key.blkno = pq_getmsgint(s, 4); - msg_req->lsn = pq_getmsgint64(s); - pq_getmsgend(s); - - msg = (ZenithMessage *) msg_req; - break; - } - /* pagestore -> pagestore_client */ case T_ZenithStatusResponse: case T_ZenithNblocksResponse: @@ -176,6 +143,18 @@ zm_unpack(StringInfo s) msg = (ZenithMessage *) msg_resp; break; } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; } return msg; @@ -474,7 +453,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) bool ok; ZenithResponse *resp; - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithExistsRequest, .page_key = { .rnode = reln->smgr_rnode.node, @@ -636,7 +615,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn; request_lsn = zenith_get_request_lsn(); - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithReadRequest, .page_key = { .rnode = reln->smgr_rnode.node, @@ -803,7 +782,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; request_lsn = zenith_get_request_lsn(); - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithNblocksRequest, .page_key = { .rnode = reln->smgr_rnode.node, From 6ef307bab099e2e7f4bfef2eba6bfc41abe3c9c4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:34 +0300 Subject: [PATCH 056/214] Improve the protocol between Postgres and page server. - Use different message formats for different kinds of response messages. - Add an Error response message, for passing errors from page server to Postgres. An Error response now results in an ereport(ERROR - Add a flag to requests, to indicate that we actually want the latest page version on the timeline, and the LSN is just a hint that we know that there haven't been any modifications since that LSN. It is currently always set to 'true', but once we start supporting read-only replicas, they would set it to false. 
This changes the network postgres<->page server protocol, so this needs corresponding changes in the page server side Also refactor and fix the zm_to_string() function. The ZenithMessageStr array was broken, because the array indices didn't match the ZenithMessageTag enum values. --- contrib/zenith/libpagestore.c | 6 +- contrib/zenith/pagestore_client.h | 68 ++++-- contrib/zenith/pagestore_smgr.c | 366 ++++++++++++++++++++++-------- 3 files changed, 323 insertions(+), 117 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 2e33474e7e4..88043adaffa 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -206,7 +206,7 @@ zenith_call(ZenithRequest *request) { StringInfoData req_buff; StringInfoData resp_buff; - ZenithMessage *resp; + ZenithResponse *resp; /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) @@ -249,10 +249,6 @@ zenith_call(ZenithRequest *request) resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - Assert(messageTag(resp) == T_ZenithStatusResponse - || messageTag(resp) == T_ZenithNblocksResponse - || messageTag(resp) == T_ZenithReadResponse); - if (message_level_is_interesting(PqPageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) resp); diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 9600c974f70..073568f90c3 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -30,15 +30,15 @@ typedef enum /* pagestore_client -> pagestore */ T_ZenithExistsRequest = 0, T_ZenithNblocksRequest, - T_ZenithReadRequest, + T_ZenithGetPageRequest, /* pagestore -> pagestore_client */ - T_ZenithStatusResponse = 100, + T_ZenithExistsResponse = 100, T_ZenithNblocksResponse, - T_ZenithReadResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, } ZenithMessageTag; - /* base struct for c-style inheritance */ typedef struct { @@ -47,32 +47,74 @@ typedef struct #define messageTag(m) (((const ZenithMessage *)(m))->tag) -extern char const *const ZenithMessageStr[]; +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
+ */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; typedef struct { + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + +typedef struct +{ + ZenithRequest req; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} PageKey; +} ZenithGetPageRequest; +/* supertype of all the Zenith*Response structs below */ typedef struct { ZenithMessageTag tag; - PageKey page_key; - XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; typedef struct { ZenithMessageTag tag; - bool ok; uint32 n_blocks; - char page[1]; -} ZenithResponse; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithMessage *zm_unpack_response(StringInfo s); +extern ZenithResponse *zm_unpack_response(StringInfo s); extern char *zm_to_string(ZenithMessage *msg); /* diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 62c30808481..6a2745eb944 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -59,16 +59,6 @@ char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; -char const *const ZenithMessageStr[] = -{ - "ZenithExistsRequest", - "ZenithNblocksRequest", - "ZenithReadRequest", - "ZenithStatusResponse", - "ZenithReadResponse", - "ZenithNblocksResponse", -}; - StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -81,25 +71,51 @@ zm_pack_request(ZenithRequest *msg) { /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } case T_ZenithNblocksRequest: - case T_ZenithReadRequest: { - ZenithRequest *msg_req = (ZenithRequest *) msg; + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - pq_sendint32(&s, msg_req->page_key.rnode.spcNode); - pq_sendint32(&s, msg_req->page_key.rnode.dbNode); - pq_sendint32(&s, msg_req->page_key.rnode.relNode); - pq_sendbyte(&s, msg_req->page_key.forknum); - pq_sendint32(&s, msg_req->page_key.blkno); - pq_sendint64(&s, msg_req->lsn); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); break; } /* pagestore -> 
pagestore_client. We never need to create these. */ - case T_ZenithStatusResponse: + case T_ZenithExistsResponse: case T_ZenithNblocksResponse: - case T_ZenithReadResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: default: elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; @@ -107,40 +123,66 @@ zm_pack_request(ZenithRequest *msg) return s; } -ZenithMessage * +ZenithResponse * zm_unpack_response(StringInfo s) { ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithMessage *msg = NULL; + ZenithResponse *resp = NULL; switch (tag) { /* pagestore -> pagestore_client */ - case T_ZenithStatusResponse: + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + case T_ZenithNblocksResponse: { - ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse)); + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); msg_resp->tag = tag; - msg_resp->ok = pq_getmsgbyte(s); msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); - msg = (ZenithMessage *) msg_resp; + resp = (ZenithResponse *) msg_resp; break; } - case T_ZenithReadResponse: + case T_ZenithGetPageResponse: { - ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse) + BLCKSZ); + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); msg_resp->tag = tag; - msg_resp->ok = pq_getmsgbyte(s); - msg_resp->n_blocks = pq_getmsgint(s, 4); memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena pq_getmsgend(s); - msg = (ZenithMessage *) msg_resp; + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; break; } @@ -151,13 +193,13 @@ zm_unpack_response(StringInfo s) */ case T_ZenithExistsRequest: case T_ZenithNblocksRequest: - case T_ZenithReadRequest: + case T_ZenithGetPageRequest: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); break; } - return msg; + return resp; } /* dump to json for debugging / error reporting purposes */ @@ -168,52 +210,107 @@ zm_to_string(ZenithMessage *msg) initStringInfo(&s); - appendStringInfoString(&s, "{"); - appendStringInfo(&s, "\"type\": \"%s\"", ZenithMessageStr[msg->tag]); - switch (messageTag(msg)) { /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithReadRequest: { - ZenithRequest *msg_req = (ZenithRequest *) msg; + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } - appendStringInfo(&s, ", \"page_key\": \"%d.%d.%d.%d.%u\", \"lsn\": \"%X/%X\"}", - 
msg_req->page_key.rnode.spcNode, - msg_req->page_key.rnode.dbNode, - msg_req->page_key.rnode.relNode, - msg_req->page_key.forknum, - msg_req->page_key.blkno, - (uint32) (msg_req->lsn >> 32), (uint32) (msg_req->lsn)); + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); break; } /* pagestore -> pagestore_client */ - case T_ZenithStatusResponse: - case T_ZenithNblocksResponse: + case T_ZenithExistsResponse: { - ZenithResponse *msg_resp = (ZenithResponse *) msg; + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u}", - msg_resp->ok, - msg_resp->n_blocks + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists ); + appendStringInfoChar(&s, '}'); break; } - case T_ZenithReadResponse: + case T_ZenithNblocksResponse: { - ZenithResponse *msg_resp = (ZenithResponse *) msg; + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u, \"page\": \"XXX\"}", - msg_resp->ok, + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", msg_resp->n_blocks ); + appendStringInfoChar(&s, '}'); + break; } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); } return s.data; } @@ -393,7 +490,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(void) +zenith_get_request_lsn(bool *latest) { XLogRecPtr lsn; @@ -402,7 +499,6 @@ zenith_get_request_lsn(void) lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = InvalidXLogRecPtr; } else if 
(am_walsender) @@ -440,6 +536,12 @@ zenith_get_request_lsn(void) XLogFlush(lsn); } } + + /* + * FIXME: In read-only mode, we would need to set *latest=false here. But we don't + * support read-only mode at the moment + */ + *latest = true; return lsn; } @@ -450,20 +552,46 @@ zenith_get_request_lsn(void) bool zenith_exists(SMgrRelation reln, ForkNumber forkNum) { - bool ok; + bool exists; ZenithResponse *resp; + bool latest; + XLogRecPtr request_lsn; - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithExistsRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithExistsRequest request = { + .req.tag = T_ZenithExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forkNum - }, - .lsn = zenith_get_request_lsn() - }); - ok = resp->ok; + }; + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithExistsResponse: + exists = ((ZenithExistsResponse *) resp)->exists; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } pfree(resp); - return ok; + return exists; } /* @@ -609,37 +737,52 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, */ void zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) + char *buffer) { ZenithResponse *resp; + bool latest; XLogRecPtr request_lsn; - request_lsn = zenith_get_request_lsn(); - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithReadRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithGetPageRequest request = { + .req.tag = T_ZenithGetPageRequest, + .req.latest = latest, + .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forkNum, .blkno = blkno - }, - .lsn = request_lsn - }); - - if (!resp->ok) - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn))); - - memcpy(buffer, resp->page, BLCKSZ); - ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; /* Clear PD_WAL_LOGGED bit stored in WAL record */ + }; + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithGetPageResponse: + memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + /* Clear PD_WAL_LOGGED bit stored in WAL record */ + 
((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -776,21 +919,46 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) { ZenithResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; + bool latest; + XLogRecPtr request_lsn; if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) return n_blocks; - request_lsn = zenith_get_request_lsn(); - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithNblocksRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithNblocksRequest request = { + .req.tag = T_ZenithNblocksRequest, + .req.latest = latest, + .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forknum, - }, - .lsn = request_lsn - }); - n_blocks = resp->n_blocks; + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithNblocksResponse: + n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", From ee5299a45844ea8d79b4d67a5c1ccf7369519c59 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:57:01 +0300 Subject: [PATCH 057/214] Run 'pgindent' on zenith-specific code in contrib/zenith and in walproposer.c --- contrib/zenith/libpagestore.c | 92 +-- contrib/zenith/pagestore_smgr.c | 525 +++++++-------- contrib/zenith/relsize_cache.c | 42 +- src/backend/replication/walproposer.c | 884 +++++++++++++++----------- src/tools/pgindent/typedefs.list | 15 + 5 files changed, 880 insertions(+), 678 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 88043adaffa..1dc708f0ad7 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,7 +42,7 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; -static ZenithResponse * zenith_call(ZenithRequest *request); +static ZenithResponse *zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call }; @@ -50,32 +50,34 @@ page_server_api api = { static void zenith_connect() { - char *query; - int ret; - char *auth_token; - char *err = NULL; + char *query; + int ret; + char *auth_token; + char *err = NULL; PQconninfoOption *conn_options; PQconninfoOption *conn_option; - int noptions = 0; + int noptions = 0; - // this is heavily inspired by psql/command.c::do_connect - conn_options = PQconninfoParse( - page_server_connstring, - &err - ); + /* this is heavily inspired by psql/command.c::do_connect */ + conn_options = PQconninfoParse(page_server_connstring, &err); - if (conn_options == NULL) { + if (conn_options == NULL) + { /* The error string is malloc'd, so we must free it explicitly */ char *errcopy = err ? 
pstrdup(err) : "out of memory"; + PQfreemem(err); ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); + errmsg("invalid connection string syntax: %s", errcopy))); } - // Trying to populate pageserver connection string with auth token from environment. - // We are looking for password in with placeholder value like $ENV_VAR_NAME, so if password field is present - // and starts with $ we try to fetch environment variable value and fail loudly if it is not set + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. + */ for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) { noptions++; @@ -83,51 +85,51 @@ zenith_connect() { if (conn_option->val != NULL && conn_option->val[0] != '\0') { - // ensure that this is a template - if (strncmp(conn_option->val, "$", 1) != 0) { - ereport( - ERROR, - ( - errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]) - ) - ); - } - + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); auth_token = getenv(&conn_option->val[1]); - if (!auth_token) { - ereport( - ERROR, - ( - errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]) - ) - ); - } else { + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { zenith_log(LOG, "using auth token from environment passed via env"); - // inspired by PQconninfoFree and conninfo_storeval - // so just free the old one and replace with freshly malloc'ed one - free(conn_option->val); - conn_option->val = strdup(auth_token); + /* + * inspired by PQconninfoFree and conninfo_storeval so + * just free the old one and replace with freshly + * malloc'ed one + */ + free(conn_option->val); + conn_option->val = strdup(auth_token); } } } } - // copy values from PQconninfoOption to key/value arrays because PQconnectdbParams accepts options this way + /* + * copy values from PQconninfoOption to key/value arrays because + * PQconnectdbParams accepts options this way + */ { const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - + int i = 0; + for (i = 0; i < noptions; i++) { keywords[i] = conn_options[i].keyword; values[i] = conn_options[i].val; } - // add array terminator + /* add array terminator */ keywords[i] = NULL; values[i] = NULL; @@ -150,7 +152,7 @@ zenith_connect() } /* Ask the Page Server to connect to us, and stream WAL from us. 
*/ - if (callmemaybe_connstring && callmemaybe_connstring[0] + if (callmemaybe_connstring && callmemaybe_connstring[0] && zenith_tenant && zenith_timeline) { diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 6a2745eb944..ac7e94f74c0 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -48,70 +48,70 @@ static char *hexdump_page(char *page); #define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) -const int SmgrTrace = DEBUG5; +const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; -char *callmemaybe_connstring; -char *zenith_timeline; -char *zenith_tenant; -bool wal_redo = false; +char *page_server_connstring; +char *callmemaybe_connstring; +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; StringInfoData zm_pack_request(ZenithRequest *msg) { - StringInfoData s; + StringInfoData s; initStringInfo(&s); pq_sendbyte(&s, msg->tag); switch (messageTag(msg)) { - /* pagestore_client -> pagestore */ + /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); - break; - } + break; + } case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); - break; - } + break; + } case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); - break; - } + break; + } - /* pagestore -> pagestore_client. We never need to create these. */ + /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_ZenithExistsResponse: case T_ZenithNblocksResponse: case T_ZenithGetPageResponse: @@ -131,66 +131,67 @@ zm_unpack_response(StringInfo s) switch (tag) { - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); - msg_resp->tag = tag; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); - msg_resp->tag = tag; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithGetPageResponse: - { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); - msg_resp->tag = tag; - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena - pq_getmsgend(s); + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); - msg_resp->tag = tag; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. - */ + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ case T_ZenithExistsRequest: case T_ZenithNblocksRequest: case T_ZenithGetPageRequest: @@ -206,108 +207,108 @@ zm_unpack_response(StringInfo s) char * zm_to_string(ZenithMessage *msg) { - StringInfoData s; + StringInfoData s; initStringInfo(&s); switch (messageTag(msg)) { - /* pagestore_client -> pagestore */ + /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": 
\"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); - appendStringInfoChar(&s, '}'); + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); - break; - } + break; + } case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); - appendStringInfoChar(&s, '}'); + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); - break; - } + break; + } case T_ZenithGetPageResponse: - { + { #if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; #endif - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } default: appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); @@ -328,7 +329,7 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, PGAlignedBlock copied_buffer; /* set the flag in the original page, like log_newpage() does. */ - ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; + ((PageHeader) page)->pd_flags |= PD_WAL_LOGGED; memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); @@ -338,19 +339,20 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - XLogRecPtr lsn = PageGetLSN(buffer); + XLogRecPtr lsn = PageGetLSN(buffer); if (ShutdownRequestPending) return; /* - * If the page was not WAL-logged before eviction then we can lose its modification. - * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. + * If the page was not WAL-logged before eviction then we can lose its + * modification. 
PD_WAL_LOGGED bit is used to mark pages which are + * wal-logged. * * See also comments to PD_WAL_LOGGED. * - * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the whole index . - * That's duplicative with the WAL-logging that we do here. + * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the + * whole index. That's duplicative with the WAL-logging that we do here. * See log_newpage_range() calls. * * FIXME: Redoing this record will set the LSN on the page. That could @@ -359,7 +361,8 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (forknum == FSM_FORKNUM && !RecoveryInProgress()) { /* FSM is never WAL-logged and we don't care. */ - XLogRecPtr recptr; + XLogRecPtr recptr; + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -368,18 +371,19 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) { /* - * Always WAL-log vm. - * We should never miss clearing visibility map bits. + * Always WAL-log vm. We should never miss clearing visibility map + * bits. * - * TODO Is it too bad for performance? - * Hopefully we do not evict actively used vm too often. + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. */ - XLogRecPtr recptr; + XLogRecPtr recptr; + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -389,45 +393,48 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } - else if (!(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED) - && !RecoveryInProgress()) + else if (!(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED) + && !RecoveryInProgress()) { - XLogRecPtr recptr; + XLogRecPtr recptr; + /* * We assume standard page layout here. * * But at smgr level we don't really know what kind of a page this is. - * We have filtered visibility map pages and fsm pages above. - * TODO Do we have any special page types? + * We have filtered visibility map pages and fsm pages above. TODO Do + * we have any special page types? */ recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); - /* If we wal-log hint bits, someone could concurrently update page - * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + /* + * If we wal-log hint bits, someone could concurrently update page and + * reset PD_WAL_LOGGED again, so this assert is not relevant anymore. * - * See comment to FlushBuffer(). - * The caller must hold a pin on the buffer and have share-locked the - * buffer contents. (Note: a share-lock does not prevent updates of - * hint bits in the buffer, so the page could change while the write - * is in progress, but we assume that that will not invalidate the data - * written.) + * See comment to FlushBuffer(). The caller must hold a pin on the + * buffer and have share-locked the buffer contents. (Note: a + * share-lock does not prevent updates of hint bits in the buffer, so + * the page could change while the write is in progress, but we assume + * that that will not invalidate the data written.) 
*/ - Assert(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by log_newpage */ + Assert(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by + * log_newpage */ /* - * Need to flush it too, so that it gets sent to the Page Server before we - * might need to read it back. It should get flushed eventually anyway, at - * least if there is some other WAL activity, so this isn't strictly - * necessary for correctness. But if there is no other WAL activity, the - * page read might get stuck waiting for the record to be streamed out - * for an indefinite time. + * Need to flush it too, so that it gets sent to the Page Server + * before we might need to read it back. It should get flushed + * eventually anyway, at least if there is some other WAL activity, so + * this isn't strictly necessary for correctness. But if there is no + * other WAL activity, the page read might get stuck waiting for the + * record to be streamed out for an indefinite time. * - * FIXME: Flushing the WAL is expensive. We should track the last "evicted" - * LSN instead, and update it here. Or just kick the bgwriter to do the - * flush, there is no need for us to block here waiting for it to finish. + * FIXME: Flushing the WAL is expensive. We should track the last + * "evicted" LSN instead, and update it here. Or just kick the + * bgwriter to do the flush, there is no need for us to block here + * waiting for it to finish. */ XLogFlush(recptr); lsn = recptr; @@ -436,14 +443,16 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); - } else { + forknum, (uint32) lsn); + } + else + { elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } SetLastWrittenPageLSN(lsn); } @@ -472,14 +481,15 @@ zenith_init(void) static XLogRecPtr zm_adjust_lsn(XLogRecPtr lsn) { - /* If lsn points to the beging of first record on page or segment, - * then "return" it back to the page origin + /* + * If lsn points to the beging of first record on page or segment, then + * "return" it back to the page origin */ - if ((lsn & (XLOG_BLCKSZ-1)) == SizeOfXLogShortPHD) + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) { lsn -= SizeOfXLogShortPHD; } - else if ((lsn & (wal_segment_size-1)) == SizeOfXLogLongPHD) + else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) { lsn -= SizeOfXLogLongPHD; } @@ -492,13 +502,13 @@ zm_adjust_lsn(XLogRecPtr lsn) static XLogRecPtr zenith_get_request_lsn(bool *latest) { - XLogRecPtr lsn; + XLogRecPtr lsn; if (RecoveryInProgress()) { lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = InvalidXLogRecPtr; } else if (am_walsender) @@ -508,7 +518,7 @@ zenith_get_request_lsn(bool *latest) } else { - XLogRecPtr flushlsn; + XLogRecPtr flushlsn; /* * Use the latest LSN that was evicted from the buffer cache. 
Any @@ -518,14 +528,15 @@ zenith_get_request_lsn(bool *latest) lsn = GetLastWrittenPageLSN(); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); /* - * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, - * we shouldn't evict a page from the buffer cache before all its modifications have - * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Probably not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. But better safe than sorry. */ flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) @@ -538,8 +549,8 @@ zenith_get_request_lsn(bool *latest) } /* - * FIXME: In read-only mode, we would need to set *latest=false here. But we don't - * support read-only mode at the moment + * FIXME: In read-only mode, we would need to set *latest=false here. But + * we don't support read-only mode at the moment */ *latest = true; return lsn; @@ -566,10 +577,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .rnode = reln->smgr_rnode.node, .forknum = forkNum }; + resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithExistsResponse: exists = ((ZenithExistsResponse *) resp)->exists; break; @@ -583,7 +596,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) reln->smgr_rnode.node.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -653,10 +666,10 @@ void zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { - XLogRecPtr lsn; + XLogRecPtr lsn; zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno+1); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -721,7 +734,7 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) + BlockNumber blocknum, BlockNumber nblocks) { /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); @@ -741,7 +754,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { ZenithResponse *resp; bool latest; - XLogRecPtr request_lsn; + XLogRecPtr request_lsn; request_lsn = zenith_get_request_lsn(&latest); { @@ -753,10 +766,12 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, .forknum = forkNum, .blkno = blkno }; + resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithGetPageResponse: memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); break; @@ -771,7 +786,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_rnode.node.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -782,22 
+797,24 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, pfree(resp); /* Clear PD_WAL_LOGGED bit stored in WAL record */ - ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; + ((PageHeader) buffer)->pd_flags &= ~PD_WAL_LOGGED; #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { - char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; mdread(reln, forkNum, blkno, mdbuf); memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked, mdbuf, BLCKSZ); - if (PageIsNew(mdbuf)) { - if (!PageIsNew(pageserver_masked)) { + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -808,23 +825,25 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, hexdump_page(buffer)); } } - else if (PageIsNew(buffer)) { + else if (PageIsNew(buffer)) + { elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); } else if (PageGetSpecialSize(mdbuf) == 0) { - // assume heap + /* assume heap */ RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -840,11 +859,12 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) { - // assume btree + /* assume btree */ RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -875,7 +895,7 @@ hexdump_page(char *page) appendStringInfo(&result, " "); if (i % 40 == 0) appendStringInfo(&result, "\n"); - appendStringInfo(&result, "%02x", (unsigned char)(page[i])); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); } return result.data; @@ -893,7 +913,7 @@ void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - XLogRecPtr lsn; + XLogRecPtr lsn; zenith_wallog_page(reln, forknum, blocknum, buffer); @@ -938,7 +958,8 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithNblocksResponse: n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; break; @@ -952,7 +973,7 @@ 
zenith_nblocks(SMgrRelation reln, ForkNumber forknum) reln->smgr_rnode.node.relNode, forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -979,17 +1000,17 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { - XLogRecPtr lsn; + XLogRecPtr lsn; set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); /* - * Truncating a relation drops all its buffers from the buffer cache without - * calling smgrwrite() on them. But we must account for that in our tracking - * of last-written-LSN all the same: any future smgrnblocks() request must - * return the new size after the truncation. We don't know what the LSN of - * the truncation record was, so be conservative and use the most recently - * inserted WAL record's LSN. + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index 5cb86e116a7..eb5b3f45a34 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -33,15 +33,21 @@ typedef struct typedef struct { - RelTag tag; + RelTag tag; BlockNumber size; } RelSizeEntry; static HTAB *relsize_hash; static LWLockId relsize_lock; -static int relsize_hash_size; +static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +/* + * Size of cache entry is 20 bytes. So 64 entry will take about 1.2 Mb, + * which seems to be a reasonable default. 
+ */ +#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) + static void zenith_smgr_shmem_startup(void) { @@ -51,7 +57,7 @@ zenith_smgr_shmem_startup(void) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId)GetNamedLWLockTranche("zenith_relsize"); + relsize_lock = (LWLockId) GetNamedLWLockTranche("zenith_relsize"); info.keysize = sizeof(RelTag); info.entrysize = sizeof(RelSizeEntry); relsize_hash = ShmemInitHash("zenith_relsize", @@ -62,13 +68,14 @@ zenith_smgr_shmem_startup(void) } bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size) +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) { - bool found = false; + bool found = false; + if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; + RelTag tag; + RelSizeEntry *entry; tag.rnode = rnode; tag.forknum = forknum; @@ -89,8 +96,8 @@ set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; + RelTag tag; + RelSizeEntry *entry; tag.rnode = rnode; tag.forknum = forknum; @@ -106,9 +113,9 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; - bool found; + RelTag tag; + RelSizeEntry *entry; + bool found; tag.rnode = rnode; tag.forknum = forknum; @@ -127,17 +134,12 @@ relsize_hash_init(void) "Sets the maximum number of cached relation sizes for zenith", NULL, &relsize_hash_size, - /* - * Size of cache entry is 20 bytes. - * So 64 entry will take about 1.2 Mb, - * which seems to be a reasonable default. - */ - 64*1024, + DEFAULT_RELSIZE_HASH_SIZE, 0, INT_MAX, PGC_POSTMASTER, 0, - NULL, NULL, NULL); + NULL, NULL, NULL); if (relsize_hash_size > 0) { @@ -147,4 +149,4 @@ relsize_hash_init(void) prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; } -} \ No newline at end of file +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5eab36461f8..b2448102aa7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -55,63 +55,65 @@ #include "utils/timestamp.h" -char* wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -bool am_wal_proposer; +char *wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +bool am_wal_proposer; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ -WalProposerFunctionsType* WalProposerFunctions = NULL; +WalProposerFunctionsType *WalProposerFunctions = NULL; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" -static int n_walkeepers = 0; -static int quorum = 0; -static WalKeeper walkeeper[MAX_WALKEEPERS]; -static WalMessage* msgQueueHead; -static WalMessage* msgQueueTail; -static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers */ -static ProposerGreeting proposerGreeting; -static WaitEventSet* waitEvents; +static int n_walkeepers = 0; +static int quorum = 0; +static WalKeeper walkeeper[MAX_WALKEEPERS]; +static WalMessage *msgQueueHead; +static WalMessage *msgQueueTail; +static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to + * this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to + * walkeepers */ +static ProposerGreeting proposerGreeting; +static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* * 
minimal LSN which may be needed for recovery of some safekeeper (end lsn * + 1 of last chunk streamed to everyone) */ -static XLogRecPtr truncateLsn; -static XLogRecPtr candidateTruncateLsn; +static XLogRecPtr truncateLsn; +static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ -static term_t propTerm; /* term of the proposer */ -static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ -static term_t donorEpoch; /* Most advanced acceptor epoch */ -static int donor; /* Most advanced acceptor */ -static int n_votes = 0; -static int n_connected = 0; -static TimestampTz last_reconnect_attempt; +static term_t propTerm; /* term of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; /* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ -static bool syncSafekeepers; +static bool syncSafekeepers; /* Declarations of a few functions ahead of time, so that we can define them out of order. */ static void AdvancePollState(int i, uint32 events); -static bool AsyncRead(int i, void* value, size_t value_size); -static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); +static bool AsyncRead(int i, void *value, size_t value_size); +static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); +static bool AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); -static WalMessage* CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static void BroadcastMessage(WalMessage* msg); +static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void BroadcastMessage(WalMessage *msg); /* * Combine hot standby feedbacks from all walkeepers. */ static void -CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ for (int i = 0; i < n_walkeepers; i++) { @@ -154,7 +156,7 @@ InitEventSet(void) * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. */ static void -UpdateEventSet(WalKeeper* wk, uint32 events) +UpdateEventSet(WalKeeper *wk, uint32 events) { /* eventPos = -1 when we don't have an event */ Assert(wk->eventPos != -1); @@ -170,19 +172,23 @@ static void HackyRemoveWalProposerEvent(int to_remove) { /* Remove the existing event set */ - if (waitEvents) { + if (waitEvents) + { FreeWaitEventSet(waitEvents); waitEvents = NULL; } /* Re-initialize it without adding any walkeeper events */ InitEventSet(); - /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have - * a socket we can use, re-add the applicable events. */ + /* + * loop through the existing walkeepers. 
If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ for (int i = 0; i < n_walkeepers; i++) { - uint32 desired_events = WL_NO_EVENTS; - WalKeeper* wk = &walkeeper[i]; + uint32 desired_events = WL_NO_EVENTS; + WalKeeper *wk = &walkeeper[i]; wk->eventPos = -1; @@ -219,58 +225,75 @@ ShutdownConnection(int i) static void ResetConnection(int i) { - pgsocket sock; /* socket of the new connection */ - WalKeeper *wk = &walkeeper[i]; + pgsocket sock; /* socket of the new connection */ + WalKeeper *wk = &walkeeper[i]; if (wk->state != SS_OFFLINE) { ShutdownConnection(i); } - /* Try to establish new connection + /* + * Try to establish new connection * * If the connection information hasn't been filled out, we need to do - * that here. */ + * that here. + */ if (wk->conninfo[0] == '\0') { - sprintf((char*) &wk->conninfo, + sprintf((char *) &wk->conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); } - wk->conn = walprop_connect_start((char*) &wk->conninfo); + wk->conn = walprop_connect_start((char *) &wk->conninfo); - /* "If the result is null, then libpq has been unable to allocate a new PGconn structure" */ + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ if (!wk->conn) elog(FATAL, "failed to allocate new PGconn object"); - /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that - * though, we need to check that it didn't immediately fail. */ + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ if (walprop_status(wk->conn) == WP_CONNECTION_BAD) { - /* According to libpq docs: - * "If the result is CONNECTION_BAD, the connection attempt has already failed, typically - * because of invalid connection parameters." + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." * We should report this failure. * - * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", wk->conninfo, walprop_error_message(wk->conn)); - /* Even though the connection failed, we still need to clean up the object */ + + /* + * Even though the connection failed, we still need to clean up the + * object + */ walprop_finish(wk->conn); wk->conn = NULL; return; } - /* The documentation for PQconnectStart states that we should call PQconnectPoll in a loop until - * it returns PGRES_POLLING_OK or PGRES_POLLING_FAILED. The other two possible returns indicate - * whether we should wait for reading or writing on the socket. For the first iteration of the - * loop, we're expected to wait until the socket becomes writable. + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. For the first + * iteration of the loop, we're expected to wait until the socket becomes + * writable. 
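For illustration, a minimal, self-contained sketch of the PQconnectStart/PQconnectPoll loop described in the comment above, waiting on the raw socket with poll(2) rather than walproposer's WaitEventSet machinery. The helper name connect_nonblocking is invented for the example; only the libpq calls (PQconnectStart, PQconnectPoll, PQstatus, PQsocket) are real, and error handling is abbreviated.

    #include <poll.h>
    #include <libpq-fe.h>

    static PGconn *
    connect_nonblocking(const char *conninfo)
    {
        PGconn *conn = PQconnectStart(conninfo);

        if (conn == NULL || PQstatus(conn) == CONNECTION_BAD)
            return conn;        /* allocation failure or bad parameters */

        /* First iteration: wait until the socket becomes writable. */
        for (PostgresPollingStatusType st = PGRES_POLLING_WRITING;
             st != PGRES_POLLING_OK && st != PGRES_POLLING_FAILED;
             st = PQconnectPoll(conn))
        {
            struct pollfd pfd;

            pfd.fd = PQsocket(conn);
            pfd.events = (st == PGRES_POLLING_READING) ? POLLIN : POLLOUT;
            pfd.revents = 0;
            (void) poll(&pfd, 1, -1 /* no timeout */);
        }

        return conn;            /* caller checks PQstatus()/PQerrorMessage() */
    }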
* - * The wording of the documentation is a little ambiguous; thankfully there's an example in the - * postgres source itself showing this behavior. - * (see libpqrcv_connect, defined in - * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + * The wording of the documentation is a little ambiguous; thankfully + * there's an example in the postgres source itself showing this behavior. + * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); @@ -287,17 +310,18 @@ ResetConnection(int i) static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void) { - XLogRecPtr responses[MAX_WALKEEPERS]; + XLogRecPtr responses[MAX_WALKEEPERS]; + /* * Sort acknowledged LSNs */ for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to epoch start lsn to the majority we - * don't really know which LSN is reliably committed as reported - * flush_lsn is physical end of wal, which can contain diverged - * history (compared to donor). + * Note that while we haven't pushed WAL up to epoch start lsn to the + * majority we don't really know which LSN is reliably committed as + * reported flush_lsn is physical end of wal, which can contain + * diverged history (compared to donor). */ responses[i] = walkeeper[i].feedback.epoch == propTerm ? walkeeper[i].feedback.flushLsn : 0; @@ -314,7 +338,7 @@ static void HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; + XLogRecPtr minQuorumLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -340,21 +364,24 @@ HandleWalKeeperResponse(void) /* Cleanup message queue */ while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) { - WalMessage* msg = msgQueueHead; + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + /* * This piece is received by everyone; try to advance truncateLsn, but * hold it back to nearest commitLsn. Thus we will always start - * streaming from the beginning of the record, which simplifies decoding - * on the far end. + * streaming from the beginning of the record, which simplifies + * decoding on the far end. * * This also prevents surprising violation of truncateLsn <= commitLsn * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and commitLsn - * generally can't be advanced based on feedback from safekeeper who is - * still in the previous epoch (similar to 'leader can't commit entries - * from previous term' in Raft); 2) chunks we read from WAL and send are - * plain sheets of bytes, but safekeepers ack only on commit boundaries. + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on commit boundaries. 
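To make the quorum computation above concrete, here is a standalone sketch of the idea behind GetAcknowledgedByQuorumWALPosition(): collect the LSN each safekeeper has acknowledged (zero for members still in an older epoch), sort the array, and take the entry at position n - quorum, where quorum is n/2 + 1 as set up later in this patch. The names and types below are simplified for the example; this is not the walproposer implementation itself.

    #include <stdint.h>
    #include <stdlib.h>

    typedef uint64_t XLogRecPtr;        /* simplified stand-in */

    static int
    compare_lsn(const void *a, const void *b)
    {
        XLogRecPtr  la = *(const XLogRecPtr *) a;
        XLogRecPtr  lb = *(const XLogRecPtr *) b;

        return (la > lb) - (la < lb);
    }

    /* Largest LSN acknowledged by at least quorum (= n/2 + 1) members. */
    static XLogRecPtr
    quorum_acknowledged_lsn(XLogRecPtr *acked, int n)
    {
        int quorum = n / 2 + 1;

        qsort(acked, n, sizeof(XLogRecPtr), compare_lsn);
        return acked[n - quorum];
    }

    /* Example: acked = {0, 0x16B3720, 0x16B3898} with n = 3 yields 0x16B3720. */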
*/ if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) { @@ -375,20 +402,20 @@ HandleWalKeeperResponse(void) memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } - if (!msgQueueHead) /* queue is empty */ + if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; /* * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are ready - * to give all WAL to pageserver. It would mean whichever majority is alive, - * there will be at least one safekeeper who is able to stream WAL to - * pageserver to make basebackup possible. However, since at the moment we - * don't have any good mechanism of defining the healthy and most advanced - * safekeeper who should push the wal into pageserver and basically the - * random one gets connected, to prevent hanging basebackup (due to - * pageserver connecting to not-synced-walkeeper) we currently wait for all - * seemingly alive walkeepers to get synced. + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-walkeeper) we currently + * wait for all seemingly alive walkeepers to get synced. */ if (syncSafekeepers) { @@ -397,8 +424,8 @@ HandleWalKeeperResponse(void) n_synced = 0; for (int i = 0; i < n_walkeepers; i++) { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ if (wk->state != SS_OFFLINE && !synced) @@ -415,16 +442,16 @@ HandleWalKeeperResponse(void) } } -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { - char* host; - char* sep; - char* port; + char *host; + char *sep; + char *port; /* Load the libpq-specific functions */ load_file("libpqwalproposer", false); @@ -439,14 +466,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); - if (port == NULL) { + if (port == NULL) + { elog(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; - if (n_walkeepers+1 >= MAX_WALKEEPERS) + if (n_walkeepers + 1 >= MAX_WALKEEPERS) { elog(FATAL, "Too many walkeepers"); } @@ -454,7 +482,11 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) walkeeper[n_walkeepers].port = port; walkeeper[n_walkeepers].state = SS_OFFLINE; walkeeper[n_walkeepers].conn = NULL; - /* Set conninfo to empty. We'll fill it out once later, in `ResetConnection` as needed */ + + /* + * Set conninfo to empty. 
We'll fill it out once later, in + * `ResetConnection` as needed + */ walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; n_walkeepers += 1; @@ -463,7 +495,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { elog(FATAL, "WalKeepers addresses are not specified"); } - quorum = n_walkeepers/2 + 1; + quorum = n_walkeepers / 2 + 1; /* Fill the greeting package */ proposerGreeting.tag = 'g'; @@ -474,12 +506,12 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; @@ -526,7 +558,8 @@ WalProposerMain(Datum main_arg) last_reconnect_attempt = GetCurrentTimestamp(); - application_name = (char *) "walproposer"; /* for synchronous_standby_names */ + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ am_wal_proposer = true; am_walsender = true; InitWalSender(); @@ -596,6 +629,7 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; + elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; @@ -611,15 +645,16 @@ WalProposerStartStreaming(XLogRecPtr startpos) * these before calling would be redundant work. */ static void -SendMessageToNode(int i, WalMessage* msg) +SendMessageToNode(int i, WalMessage *msg) { - WalKeeper* wk = &walkeeper[i]; + WalKeeper *wk = &walkeeper[i]; /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); + /* - * Skip already acknowledged messages. Used after reconnection to get to the - * first not yet sent message. Otherwise we always just send 'msg'. + * Skip already acknowledged messages. Used after reconnection to get to + * the first not yet sent message. Otherwise we always just send 'msg'. */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; @@ -632,7 +667,10 @@ SendMessageToNode(int i, WalMessage* msg) wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); wk->currMsg->req.truncateLsn = truncateLsn; - /* Once we've selected and set up our message, actually start sending it. */ + /* + * Once we've selected and set up our message, actually start sending + * it. 
+ */ wk->state = SS_SEND_WAL; /* Don't ned to update the event set; that's done by AdvancePollState */ @@ -649,7 +687,7 @@ SendMessageToNode(int i, WalMessage* msg) * Broadcast new message to all caught-up walkeepers */ static void -BroadcastMessage(WalMessage* msg) +BroadcastMessage(WalMessage *msg) { for (int i = 0; i < n_walkeepers; i++) { @@ -660,12 +698,13 @@ BroadcastMessage(WalMessage* msg) } } -static WalMessage* -CreateMessage(XLogRecPtr startpos, char* data, int len) +static WalMessage * +CreateMessage(XLogRecPtr startpos, char *data, int len) { /* Create new message and append it to message queue */ - WalMessage* msg; - XLogRecPtr endpos; + WalMessage *msg; + XLogRecPtr endpos; + len -= XLOG_HDR_SIZE; endpos = startpos + len; if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) @@ -674,7 +713,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) return NULL; } Assert(len >= 0); - msg = (WalMessage*)malloc(sizeof(WalMessage) + len); + msg = (WalMessage *) malloc(sizeof(WalMessage) + len); if (msgQueueTail != NULL) msgQueueTail->next = msg; else @@ -691,7 +730,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->req.beginLsn = startpos; msg->req.endLsn = endpos; msg->req.proposerId = proposerGreeting.proposerId; - memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); + memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); lastSentLsn = msg->req.endLsn; @@ -699,9 +738,10 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) } void -WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) +WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) { - WalMessage* msg = CreateMessage(startpos, data, len); + WalMessage *msg = CreateMessage(startpos, data, len); + if (msg != NULL) BroadcastMessage(msg); } @@ -710,13 +750,13 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) * Create WAL message with no data, just to let the walkeepers * know that commit lsn has advanced. */ -static WalMessage* +static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn) { /* Create new message and append it to message queue */ - WalMessage* msg; + WalMessage *msg; - msg = (WalMessage*)malloc(sizeof(WalMessage)); + msg = (WalMessage *) malloc(sizeof(WalMessage)); if (msgQueueTail != NULL) msgQueueTail->next = msg; else @@ -730,20 +770,24 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; + /* - * This serves two purposes: - * 1) After all msgs from previous epochs are pushed we queue empty - * WalMessage with lsn set to epochStartLsn which commands to switch the - * epoch, which allows to do the switch without creating new epoch - * records (we especially want to avoid such in --sync mode). - * Walproposer can advance commit_lsn only after the switch, so this lsn - * (reported back) also is the first possible advancement point. + * This serves two purposes: 1) After all msgs from previous epochs are + * pushed we queue empty WalMessage with lsn set to epochStartLsn which + * commands to switch the epoch, which allows to do the switch without + * creating new epoch records (we especially want to avoid such in --sync + * mode). Walproposer can advance commit_lsn only after the switch, so + * this lsn (reported back) also is the first possible advancement point. * 2) Maintain common invariant of queue entries sorted by LSN. 
*/ msg->req.beginLsn = lsn; msg->req.endLsn = lsn; msg->req.proposerId = proposerGreeting.proposerId; - /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + + /* + * truncateLsn and commitLsn are set just before the message sent, in + * SendMessageToNode() + */ return msg; } @@ -779,10 +823,10 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. To keep the idea of always starting streaming since record - * boundary (which simplifies decoding on safekeeper), take start position - * of the slot. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. To keep the idea of always starting streaming since + * record boundary (which simplifies decoding on safekeeper), take start + * position of the slot. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { @@ -791,10 +835,11 @@ DetermineEpochStartLsn(void) ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } + /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to some - * connected safekeeper; it must have carried truncateLsn pointing to the - * first record. + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. */ Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); @@ -834,6 +879,7 @@ static void ReconnectWalKeepers(void) { TimestampTz now = GetCurrentTimestamp(); + if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; @@ -851,8 +897,8 @@ ReconnectWalKeepers(void) static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) { - char conninfo[MAXCONNINFO]; - char *err; + char conninfo[MAXCONNINFO]; + char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; @@ -880,18 +926,19 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec if (walrcv_startstreaming(wrconn, &options)) { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) { if (len == 0) { (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); } else { @@ -915,15 +962,15 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec { ereport(LOG, (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32)(startpos >> 32), (uint32)startpos))); + timeline, (uint32) (startpos >> 32), (uint32) startpos))); return false; } /* * Start sending entries to everyone from the beginning (truncateLsn), - * except for those who lives in donor's epoch and thus for sure has correct - * WAL. We could do here even slightly better, taking into account commitLsn - * of the rest to avoid sending them excessive data. + * except for those who lives in donor's epoch and thus for sure has + * correct WAL. 
We could do here even slightly better, taking into account + * commitLsn of the rest to avoid sending them excessive data. */ for (int i = 0; i < n_walkeepers; i++) { @@ -945,13 +992,13 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - uint32 len; - uint32 size; + uint32 len; + uint32 size; /* - * By convention we always stream since the beginning of the - * record, and flushLsn points to it -- form the message - * starting there. + * By convention we always stream since the beginning of + * the record, and flushLsn points to it -- form the + * message starting there. */ len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; size = sizeof(AppendRequestHeader) + len; @@ -961,8 +1008,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec walkeeper[i].voteResponse.flushLsn; memcpy(&msg->perSafekeeper[i] + 1, (char *) (&msg->req + 1) + - walkeeper[i].voteResponse.flushLsn - - msg->req.beginLsn, + walkeeper[i].voteResponse.flushLsn - + msg->req.beginLsn, len); SendMessageToNode(i, msg); break; @@ -981,25 +1028,28 @@ WalProposerPoll(void) { while (true) { - WalKeeper* wk; - int rc; - int i; + WalKeeper *wk; + int rc; + int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper*) event.user_data; - i = (int)(wk - walkeeper); + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper *) event.user_data; + i = (int) (wk - walkeeper); /* * If the event contains something that one of our walkeeper states * was waiting for, we'll advance its state. */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE))) + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) AdvancePollState(i, event.events); - /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ + /* + * If the timeout expired, attempt to reconnect to any walkeepers that + * we dropped + */ ReconnectWalKeepers(); /* @@ -1020,91 +1070,116 @@ WalProposerPoll(void) static void AdvancePollState(int i, uint32 events) { - WalKeeper* wk = &walkeeper[i]; + WalKeeper *wk = &walkeeper[i]; - /* Keep advancing the state while either: - * (a) the event is still unprocessed (usually because it's the first - * iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + /* + * Keep advancing the state while either: (a) the event is still + * unprocessed (usually because it's the first iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events */ while (events || StateShouldImmediatelyExecute(wk->state)) { - /* Sanity check. We assume further down that the operations don't block - * because the socket is ready. */ + /* + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. 
+ */ AssertEventsOkForState(events, wk); /* Execute the code corresponding to the current state */ switch (wk->state) { - /* WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection */ + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ case SS_OFFLINE: elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ - /* Both connecting states run the same logic. The only difference is - * the events they're expecting */ + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); - - /* Once we're fully connected, we can move to the next state */ - wk->state = SS_EXEC_STARTWALPUSH; - - /* Even though SS_EXEC_STARTWALPUSH doesn't wait on anything, - * we do need to replace the current event, so we have to - * just pick something. We'll eventually need the socket to - * be readable, so we go with that. */ - new_events = WL_SOCKET_READABLE; - break; + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* If we need to poll to finish connecting, continue doing that */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - /* If connecting failed, we don't want to restart the connection because - * that might run us into a loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to ReconnectWalKeepers. */ - ShutdownConnection(i); - return; - } + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); + + /* + * Once we're fully connected, we can move to the + * next state + */ + wk->state = SS_EXEC_STARTWALPUSH; + + /* + * Even though SS_EXEC_STARTWALPUSH doesn't wait + * on anything, we do need to replace the current + * event, so we have to just pick something. We'll + * eventually need the socket to be readable, so + * we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. 
+ */ + ShutdownConnection(i); + return; + } - /* Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on the new - * socket. */ - HackyRemoveWalProposerEvent(i); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(i); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + break; + } - /* Send "START_WAL_PUSH" command to the walkeeper. After sending, - * wait for response with SS_WAIT_EXEC_RESULT */ + /* + * Send "START_WAL_PUSH" command to the walkeeper. After + * sending, wait for response with SS_WAIT_EXEC_RESULT + */ case SS_EXEC_STARTWALPUSH: if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + wk->host, wk->port, walprop_error_message(wk->conn)); ResetConnection(i); return; } @@ -1116,59 +1191,88 @@ AdvancePollState(int i, uint32 events) case SS_WAIT_EXEC_RESULT: switch (walprop_get_query_result(wk->conn)) { - /* Successful result, move on to starting the handshake */ + /* + * Successful result, move on to starting the + * handshake + */ case WP_EXEC_SUCCESS_COPYBOTH: - /* Because this state is immediately executable, we'll - * start this on the next iteration of the loop */ + + /* + * Because this state is immediately executable, we'll + * start this on the next iteration of the loop + */ wk->state = SS_HANDSHAKE_SEND; break; - /* Needs repeated calls to finish. Wait until the socket is - * readable */ + /* + * Needs repeated calls to finish. Wait until the + * socket is readable + */ case WP_EXEC_NEEDS_INPUT: - /* SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set */ + + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ break; case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + wk->host, wk->port, walprop_error_message(wk->conn)); ResetConnection(i); return; - /* Unexpected result -- funamdentally an error, but we want to produce a custom - * message, rather than a generic "something went wrong" */ + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", - wk->host, wk->port); + wk->host, wk->port); ResetConnection(i); return; } break; - /* Start handshake: first of all send information about the WAL - * keeper. After sending, we wait on SS_HANDSHAKE_RECV for a - * response to finish the handshake. */ + /* + * Start handshake: first of all send information about the + * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ case SS_HANDSHAKE_SEND: - /* On failure, logging & resetting the connection is handled. We - * just need to handle the control flow. */ + + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. 
+ */ if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) return; break; - /* Finish handshake comms: receive information about the WAL keeper */ + /* + * Finish handshake comms: receive information about the WAL + * keeper + */ case SS_HANDSHAKE_RECV: - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ + + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) return; /* Protocol is all good, move to voting. */ wk->state = SS_VOTING; - /* Don't need to update the event set yet. Either we update the - * event set to WL_SOCKET_READABLE *or* we change the state to - * SS_SEND_VOTE in the loop below */ + + /* + * Don't need to update the event set yet. Either we update + * the event set to WL_SOCKET_READABLE *or* we change the + * state to SS_SEND_VOTE in the loop below + */ UpdateEventSet(wk, WL_SOCKET_READABLE); wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; @@ -1179,14 +1283,19 @@ AdvancePollState(int i, uint32 events) */ propTerm = Max(walkeeper[i].greet.term, propTerm); - /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. - * We'll eventually get a task when the election starts. + /* + * Check if we have quorum. If there aren't enough walkeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. * - * If we do have quorum, we can start an election */ + * If we do have quorum, we can start an election + */ if (++n_connected < quorum) { - /* SS_VOTING is an idle state; read-ready indicates the - * connection closed. */ + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. + */ UpdateEventSet(wk, WL_SOCKET_READABLE); } else @@ -1195,19 +1304,26 @@ AdvancePollState(int i, uint32 events) { propTerm++; /* prepare voting message */ - voteRequest = (VoteRequest) { + voteRequest = (VoteRequest) + { .tag = 'v', .term = propTerm }; memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } - /* Now send voting request to the cohort and wait responses */ + /* + * Now send voting request to the cohort and wait + * responses + */ for (int j = 0; j < n_walkeepers; j++) { - /* Remember: SS_VOTING indicates that the walkeeper is participating in - * voting, but hasn't sent anything yet. The ones that have sent something - * are given SS_SEND_VOTE or SS_WAIT_VERDICT. */ + /* + * Remember: SS_VOTING indicates that the walkeeper is + * participating in voting, but hasn't sent anything + * yet. The ones that have sent something are given + * SS_SEND_VOTE or SS_WAIT_VERDICT. + */ if (walkeeper[j].state == SS_VOTING) { walkeeper[j].state = SS_SEND_VOTE; @@ -1218,16 +1334,18 @@ AdvancePollState(int i, uint32 events) } break; - /* Voting is an idle state - we don't expect any events to trigger. Refer to the - * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to - * SS_SEND_VOTE. */ + /* + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see + * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. 
+ */ case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); ResetConnection(i); break; - /* We have quorum for voting, send our vote request */ + /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: /* On failure, logging & resetting is handled */ if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) @@ -1236,10 +1354,14 @@ AdvancePollState(int i, uint32 events) /* If successful, wait for read-ready with SS_WAIT_VERDICT */ break; - /* Start reading the walkeeper response for our candidate */ + /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ + + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; @@ -1259,8 +1381,8 @@ AdvancePollState(int i, uint32 events) (wk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); + wk->host, wk->port, + wk->voteResponse.term, propTerm); } Assert(wk->voteResponse.term == propTerm); @@ -1268,17 +1390,24 @@ AdvancePollState(int i, uint32 events) if (++n_votes != quorum) { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + /* + * We are already streaming WAL: send all pending messages + * to the attached walkeeper + */ SendMessageToNode(i, msgQueueHead); } else { wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for read-ready */ + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ DetermineEpochStartLsn(); - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ if (truncateLsn < propEpochStartLsn) { elog(LOG, @@ -1289,11 +1418,12 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); + /* - * This message signifies epoch switch; it is needed to - * make the switch happen on donor, as he won't get any - * other messages until we start writing new WAL (and we - * e.g. don't in --sync mode at all) + * This message signifies epoch switch; it is needed + * to make the switch happen on donor, as he won't get + * any other messages until we start writing new WAL + * (and we e.g. don't in --sync mode at all) */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); @@ -1315,101 +1445,124 @@ AdvancePollState(int i, uint32 events) break; - /* Idle state for sending WAL. Moved out only by calls to - * SendMessageToNode */ + /* + * Idle state for sending WAL. Moved out only by calls to + * SendMessageToNode + */ case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); ResetConnection(i); break; - /* Start to send the message at wk->currMsg. Triggered only by calls - * to SendMessageToNode */ + /* + * Start to send the message at wk->currMsg. 
Triggered only by + * calls to SendMessageToNode + */ case SS_SEND_WAL: - { - WalMessage* msg = wk->currMsg; - AppendRequestHeader *req = &msg->req; + { + WalMessage *msg = wk->currMsg; + AppendRequestHeader *req = &msg->req; - /* if there is a message specially crafted for this safekeeper, send it */ - if (msg->perSafekeeper[i]) - req = msg->perSafekeeper[i]; + /* + * if there is a message specially crafted for this + * safekeeper, send it + */ + if (msg->perSafekeeper[i]) + req = msg->perSafekeeper[i]; - elog(LOG, - "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - - /* We write with msg->size here because the body of the message - * is stored after the end of the WalMessage struct, in the - * allocation for each msg */ - if (!AsyncWrite(i, req, - sizeof(AppendRequestHeader) + req->endLsn - + elog(LOG, + "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + + /* + * We write with msg->size here because the body of the + * message is stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(i, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) - return; + SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + return; - break; - } + break; + } - /* Flush the WAL message we're sending from SS_SEND_WAL */ + /* Flush the WAL message we're sending from SS_SEND_WAL */ case SS_SEND_WAL_FLUSH: - /* AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll wait - * until the next poll comes along. */ + + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. + */ if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) return; break; - /* Start to receive the feedback from a message sent via SS_SEND_WAL */ + /* + * Start to receive the feedback from a message sent via + * SS_SEND_WAL + */ case SS_RECV_FEEDBACK: - { - WalMessage* next; - XLogRecPtr minQuorumLsn; - - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ - if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) - return; + { + WalMessage *next; + XLogRecPtr minQuorumLsn; - next = wk->currMsg->next; - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) + return; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); /* Updates state & event set */ + next = wk->currMsg->next; + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms + * receiving of this + * message */ - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. 
- */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); /* Updates state & event set */ - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); /* - * commitLsn is always the record boundary; remember it so - * we can advance truncateLsn there. But do so only if - * previous value is applied, otherwise it might never catch - * up. + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. */ - if (candidateTruncateLsn == InvalidXLogRecPtr) + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + + if (minQuorumLsn > lastSentCommitLsn) { - candidateTruncateLsn = minQuorumLsn; + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + + /* + * commitLsn is always the record boundary; remember + * it so we can advance truncateLsn there. But do so + * only if previous value is applied, otherwise it + * might never catch up. + */ + if (candidateTruncateLsn == InvalidXLogRecPtr) + { + candidateTruncateLsn = minQuorumLsn; + } + lastSentCommitLsn = minQuorumLsn; } - lastSentCommitLsn = minQuorumLsn; + break; } - break; - } } - /* We've already done something for these events - don't attempt more - * states than we need to. */ + /* + * We've already done something for these events - don't attempt more + * states than we need to. + */ events = WL_NO_EVENTS; } } @@ -1423,44 +1576,47 @@ AdvancePollState(int i, uint32 events) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncRead(int i, void* value, size_t value_size) +AsyncRead(int i, void *value, size_t value_size) { - WalKeeper* wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + char *buf = NULL; + int buf_size = -1; + uint32 events; switch (walprop_async_read(wk->conn, &buf, &buf_size)) { - /* On success, there's just a couple more things we'll check below */ + /* On success, there's just a couple more things we'll check below */ case PG_ASYNC_READ_SUCCESS: break; - /* If we need more input, wait until the socket is read-ready and try - * again. */ + /* + * If we need more input, wait until the socket is read-ready and + * try again. + */ case PG_ASYNC_READ_TRY_AGAIN: UpdateEventSet(wk, WL_SOCKET_READABLE); return false; case PG_ASYNC_READ_FAIL: elog(WARNING, "Failed to read from node %s:%s in %s state: %s", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); + wk->host, wk->port, + FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); ResetConnection(i); return false; } /* - * If we get here, the read was ok, but we still need to check it was the right amount + * If we get here, the read was ok, but we still need to check it was the + * right amount */ if ((size_t) buf_size != value_size) { elog(FATAL, - "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - value_size, buf_size); + "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", + wk->host, wk->port, + FormatWalKeeperState(wk->state), + value_size, buf_size); } /* Copy the resulting info into place */ @@ -1481,10 +1637,10 @@ AsyncRead(int i, void* value, size_t value_size) * single packet. 
*/ static bool -BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; if (!walprop_blocking_write(wk->conn, msg, msg_size)) { @@ -1497,8 +1653,10 @@ BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) wk->state = success_state; - /* If the new state will be waiting for events to happen, update the event - * set to wait for those */ + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ events = WalKeeperStateDesiredEvents(success_state); if (events) UpdateEventSet(wk, events); @@ -1515,10 +1673,10 @@ BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) * emitted and the connection is reset. */ static bool -AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; switch (walprop_async_write(wk->conn, msg, msg_size)) { @@ -1526,9 +1684,12 @@ AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKee wk->state = success_state; break; case PG_ASYNC_WRITE_TRY_FLUSH: - /* We still need to call PQflush some more to finish the job; go to - * the appropriate state. Update the event set at the bottom of this - * function */ + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ wk->state = flush_state; break; case PG_ASYNC_WRITE_FAIL: @@ -1559,10 +1720,11 @@ AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKee static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; - /* PQflush returns: + /*--- + * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index fb59bd3be73..fe848491300 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2160,6 +2160,8 @@ RelMapFile RelMapping RelOptInfo RelOptKind +RelSizeEntry +RelTag RelToCheck RelToCluster RelabelType @@ -2848,6 +2850,8 @@ WaitEventTimeout WaitPMResult WalCloseMethod WalLevel +WalKeeper +WalMessage WalRcvData WalRcvExecResult WalRcvExecStatus @@ -2951,6 +2955,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse _SPI_connection _SPI_plan __AssignProcessToJobObject From b935705ba791837f46ad424982142ab0fd762f86 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Sep 2021 15:00:32 +0300 Subject: [PATCH 058/214] Fix a badly worded comment --- contrib/zenith/relsize_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index eb5b3f45a34..0ba99a128f9 100644 --- 
a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -43,8 +43,8 @@ static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; /* - * Size of cache entry is 20 bytes. So 64 entry will take about 1.2 Mb, - * which seems to be a reasonable default. + * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * which seems reasonable. */ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) From 23a6e79bee9b1845cb3242334e9dff01d0c24cd1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 16 Sep 2021 15:20:03 +0300 Subject: [PATCH 059/214] Simplify a2e929e by storing starting point in walkeeper itself. --- src/backend/replication/walproposer.c | 63 ++++++++++++++------------- src/include/replication/walproposer.h | 15 +++---- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index b2448102aa7..35624a77352 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -394,11 +394,6 @@ HandleWalKeeperResponse(void) truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } - for (int i = 0; i < n_walkeepers; i++) - { - if (msg->perSafekeeper[i]) - free(msg->perSafekeeper[i]); - } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } @@ -489,6 +484,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; } if (n_walkeepers < 1) @@ -723,7 +719,6 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; - memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -766,7 +761,6 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; - memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -992,25 +986,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - uint32 len; - uint32 size; - /* * By convention we always stream since the beginning of - * the record, and flushLsn points to it -- form the - * message starting there. + * the record, and flushLsn points to it. 
*/ - len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; - size = sizeof(AppendRequestHeader) + len; - msg->perSafekeeper[i] = malloc(size); - *msg->perSafekeeper[i] = msg->req; - msg->perSafekeeper[i]->beginLsn = - walkeeper[i].voteResponse.flushLsn; - memcpy(&msg->perSafekeeper[i] + 1, - (char *) (&msg->req + 1) + - walkeeper[i].voteResponse.flushLsn - - msg->req.beginLsn, - len); + walkeeper[i].startStreamingAt = walkeeper[i].voteResponse.flushLsn; SendMessageToNode(i, msg); break; } @@ -1307,7 +1287,7 @@ AdvancePollState(int i, uint32 events) voteRequest = (VoteRequest) { .tag = 'v', - .term = propTerm + .term = propTerm }; memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } @@ -1465,16 +1445,33 @@ AdvancePollState(int i, uint32 events) AppendRequestHeader *req = &msg->req; /* - * if there is a message specially crafted for this - * safekeeper, send it + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. */ - if (msg->perSafekeeper[i]) - req = msg->perSafekeeper[i]; + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; + + Assert(wk->startStreamingAt < req->endLsn); + + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } elog(LOG, - "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); @@ -1487,7 +1484,13 @@ AdvancePollState(int i, uint32 events) sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + { + if (req != &msg->req) + free(req); return; + } + if (req != &msg->req) + free(req); break; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index c455d0564e9..222faaea41d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -258,13 +258,7 @@ struct WalMessage { WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ - uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - /* - * By convention safekeeper starts receiving data since record boundary, we - * may need to send first message not from the chunk beginning for that; - * such trimmed message is formed here. 
- */ - AppendRequestHeader *perSafekeeper[MAX_WALKEEPERS]; + uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: @@ -327,7 +321,12 @@ typedef struct WalKeeper WalKeeperState state; /* walkeeper state machine state */ AcceptorGreeting greet; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ - AppendResponse feedback; /* feedback to master */ + AppendResponse feedback; /* feedback to master */ + /* + * streaming must be started at the record boundary which is saved here, if + * it differs from the chunk start + */ + XLogRecPtr startStreamingAt; } WalKeeper; From 4ed35d5ef7d7712bf0182380fe16f6014394dd76 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Wed, 22 Sep 2021 10:54:47 -0700 Subject: [PATCH 060/214] Catch walkeeper ErrorResponse in PQgetCopyData PQgetCopyData can sometimes indicate that the copy is done if the backend returns an error response. So while we still expect that the walkeeper never sends CopyDone, we can't expect it to never produce errors. --- .../libpqwalproposer/libpqwalproposer.c | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 1b8a53b5066..f538ed9133f 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -273,16 +273,32 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * (> 0) if it was successful; that value is the amount transferred. * * The protocol we use between walproposer and walkeeper means that we - * (i.e. walproposer) won't ever receive a message saying that the copy - * is done. */ + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ switch (result = PQgetCopyData(conn->pg_conn, buf, true)) { case 0: return PG_ASYNC_READ_TRY_AGAIN; case -1: - /* As mentioned above; this shouldn't happen */ - elog(FATAL, "unexpected return -1 from PQgetCopyData"); - break; + { + /* + * If we get -1, it's probably because of a server error; the + * walkeeper won't normally send a CopyDone message. + * + * We can check PQgetResult to make sure that the server failed; + * it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* If there was actually an error, it'll be properly reported by + * calls to PQerrorMessage -- we don't have to do anything else */ + return PG_ASYNC_READ_FAIL; + } case -2: return PG_ASYNC_READ_FAIL; default: From 1871809c2c6cb700ba2a6e0ba3458ec685bcb4e6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Sep 2021 19:48:36 +0300 Subject: [PATCH 061/214] Use buffered I/O for reading commands from stdin. Whatever the bug mentioned in the FIXME comment was with buffered I/O, it has been fixed now. This greatly reduces the amount of CPU time spent in WAL redo. 
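For reference, the command framing that ReadRedoCommand() parses is the same
shape as in the FE/BE protocol: a one-byte message type, then a 4-byte
network-order length that counts itself plus the payload. A minimal
sender-side sketch of that framing (illustrative only; send_redo_command()
and its fd argument are hypothetical names, not part of this patch):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    static int
    send_redo_command(int fd, char msgtype, const void *payload, uint32_t payload_len)
    {
        char        hdr[5];
        uint32_t    netlen = htonl(payload_len + 4);    /* length counts itself */

        hdr[0] = msgtype;
        memcpy(&hdr[1], &netlen, sizeof(netlen));

        /* fixed-size header first, then the payload */
        if (write(fd, hdr, sizeof(hdr)) != (ssize_t) sizeof(hdr))
            return -1;
        if (payload_len > 0 &&
            write(fd, payload, payload_len) != (ssize_t) payload_len)
            return -1;
        return 0;
    }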
--- src/backend/tcop/zenith_wal_redo.c | 74 ++++++++---------------------- 1 file changed, 18 insertions(+), 56 deletions(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 15db900cc8a..be8cc59a94b 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -109,6 +109,7 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(exit_group), PG_SCMP_ALLOW(pselect6), PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(fstat), /* needed by fread() */ PG_SCMP_ALLOW(select), PG_SCMP_ALLOW(write), @@ -433,84 +434,45 @@ pprint_tag(BufferTag *tag) * EOF is returned if end-of-file input is seen; time to shut down. * ---------------- */ - -/* - * Wait until there is data in stdin. Prints a log message every 10 s whil - * waiting. - */ -static void -wait_with_timeout(void) -{ - for (;;) - { - struct timeval timeout = {10, 0}; - fd_set fds; - int ret; - - FD_ZERO(&fds); - FD_SET(STDIN_FILENO, &fds); - - ret = select(1, &fds, NULL, NULL, &timeout); - if (ret != 0) - break; - elog(DEBUG1, "still alive"); - } -} - static int ReadRedoCommand(StringInfo inBuf) { - char c; + char hdr[1 + sizeof(int32)]; int qtype; int32 len; - int nread; - - /* FIXME: Use unbuffered I/O here, because the WAL redo process was getting - * stuck with buffered I/O. I'm not sure why, or whether the bug was somewhere - * in here or in the calling page server side. - */ - wait_with_timeout(); - if (read(STDIN_FILENO, &c, 1) == 0) - return EOF; - qtype = c; - /* - * Like in the FE/BE protocol, all messages have a length word next - * after the type code; we can read the message contents independently of - * the type. - */ - if (read(STDIN_FILENO, &len, 4) != 4) + /* Read message type and message length */ + if (fread(hdr, 1, sizeof(hdr), stdin) != sizeof(hdr)) { - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("could not read message length"))); + if (ferror(stdin) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message header"))); + return EOF; } - + qtype = hdr[0]; + memcpy(&len, &hdr[1], sizeof(int32)); len = pg_ntoh32(len); if (len < 4) - { ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid message length"))); - return EOF; - } len -= 4; /* discount length itself */ + /* Read the message payload */ enlargeStringInfo(inBuf, len); - nread = 0; - while (nread < len) { - int n = read(STDIN_FILENO, inBuf->data + nread, len - nread); - if (n == -1) + if (fread(inBuf->data, 1, len, stdin) != len) + { + if (ferror(stdin) != 0) ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("read error: %m"))); - if (n == 0) + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message"))); + else ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("unexpected EOF"))); - nread += n; } inBuf->len = len; inBuf->data[len] = '\0'; From 27e3a504bdbac9c76834b57b710579f4a89461b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Sep 2021 10:27:36 +0300 Subject: [PATCH 062/214] Replace fread() with plain read() and a hand-written buffer. 
The fread() call required allowing the 'fstat' syscall in the seccomp configuration, and apparently on some platforms also 'newfstatat', as Max reported this error: Sep 28 15:56:55.522 ERRO wal-redo-postgres: --------------------------------------- Sep 28 15:56:55.522 ERRO wal-redo-postgres: seccomp: bad syscall 262 Sep 28 15:56:55.522 ERRO wal-redo-postgres: --------------------------------------- I'm afraid of allowing 'newfstatat', that seems like it's opening too much attack surface, since it allows access to files by filename. Maybe it's OK, but I'm not sure, but there isn't any fundamental reason why we'd need to call it, I'm not sure why glibc's fread() wants to call it. So let's avoid the trouble by writing our own simple buffer over plain read(). --- src/backend/tcop/zenith_wal_redo.c | 84 +++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index be8cc59a94b..a02592fc0c2 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -94,6 +94,7 @@ static void PushPage(StringInfo input_message); static void ApplyRecord(StringInfo input_message); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); +static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; @@ -109,7 +110,6 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(exit_group), PG_SCMP_ALLOW(pselect6), PG_SCMP_ALLOW(read), - PG_SCMP_ALLOW(fstat), /* needed by fread() */ PG_SCMP_ALLOW(select), PG_SCMP_ALLOW(write), @@ -352,6 +352,8 @@ WalRedoMain(int argc, char *argv[], * EOF means we're done. Perform normal shutdown. */ case EOF: + ereport(LOG, + (errmsg("received EOF on stdin, shutting down"))); #ifdef HAVE_LIBSECCOMP /* @@ -437,19 +439,27 @@ pprint_tag(BufferTag *tag) static int ReadRedoCommand(StringInfo inBuf) { + ssize_t ret; char hdr[1 + sizeof(int32)]; int qtype; int32 len; /* Read message type and message length */ - if (fread(hdr, 1, sizeof(hdr), stdin) != sizeof(hdr)) + ret = buffered_read(hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) { - if (ferror(stdin) != 0) + if (ret == 0) + return EOF; + else if (ret < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message header"))); - return EOF; + errmsg("could not read message header: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); } + qtype = hdr[0]; memcpy(&len, &hdr[1], sizeof(int32)); len = pg_ntoh32(len); @@ -463,12 +473,13 @@ ReadRedoCommand(StringInfo inBuf) /* Read the message payload */ enlargeStringInfo(inBuf, len); - if (fread(inBuf->data, 1, len, stdin) != len) + ret = buffered_read(inBuf->data, len); + if (ret != len) { - if (ferror(stdin) != 0) + if (ret < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message"))); + errmsg("could not read message: %m"))); else ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -679,3 +690,60 @@ GetPage(StringInfo input_message) elog(TRACE, "Page sent back for block %u", blknum); } + + +/* Buffer used by buffered_read() */ +static char stdin_buf[16 * 1024]; +static size_t stdin_len = 0; /* # of bytes in buffer */ +static size_t stdin_ptr = 0; /* # of bytes already consumed */ + +/* + * Like read() on stdin, but buffered. + * + * We cannot use libc's buffered fread(), because it uses syscalls that we + * have disabled with seccomp(). 
Depending on the platform, it can call + * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' + * seems problematic because it allows interrogating files by path name. + * + * The return value is the number of bytes read. On error, -1 is returned, and + * errno is set appropriately. Unlike read(), this fills the buffer completely + * unless an error happens or EOF is reached. + */ +static ssize_t +buffered_read(void *buf, size_t count) +{ + char *dst = buf; + + while (count > 0) + { + size_t nthis; + + if (stdin_ptr == stdin_len) + { + ssize_t ret; + + ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); + if (ret < 0) + { + /* don't do anything here that could set 'errno' */ + return ret; + } + if (ret == 0) + { + /* EOF */ + break; + } + stdin_len = (size_t) ret; + stdin_ptr = 0; + } + nthis = Min(stdin_len - stdin_ptr, count); + + memcpy(dst, &stdin_buf[stdin_ptr], nthis); + + stdin_ptr += nthis; + count -= nthis; + dst += nthis; + } + + return (dst - (char *) buf); +} From afa880df4b6ae42d68dfa657d634c2a6fb46839d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 6 Oct 2021 10:57:00 +0300 Subject: [PATCH 063/214] Store unlogged tables locally, and replace PD_WAL_LOGGED. The smgr implementation needs to distinguish between unlogged/temp and regular 'permanent' relations, but the smgr API doesn't currently include that information. Add a 'relpersistence' field to SmgrRelationData, and as an argument to smgropen(). However, not all callers of smgropen() have a relcache entry at hand, so we allow some operations to pass 0, meaning 'unknown'. Now that we can store unlogged tables locally, use the same machinery to handle the buffered GiST and SP-GiST index builds. They populate the index by inserting all the tuples, and use the shared buffer cache while they do that. They don't WAL-log the pages while they do that, they log the whole relation as a separate bulk operation after the build has finished. That poses a problem for Zenith, where smgrwrite() is a no-op and we rely on WAL-logging to reconstruct the pages. Solve that problem by storing the pages locally in the compute node, like an unlogged relation, until the index build finishes and all the pages have been WAL-logged. To do that, the smgr needs to know when the caller is an unlogged build operation like that, so add functions to the Smgr API for that. With this commit, we no longer generate an FPI record whenever a rel is extended with an all-zeros page. See github issue #482. That greatly reduces the amount of WAL generated during bulk loading. 
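As an illustration of the extended smgropen() interface (a sketch only; the
exact argument position and the surrounding variable names are assumptions,
not lifted verbatim from this patch): callers that have a relcache entry pass
the relation's persistence through, while callers that don't, such as the
buffer-eviction path, pass 0 for "unknown":

    /* with a relcache entry at hand: pass the real persistence */
    rel->rd_smgr = smgropen(rel->rd_node, rel->rd_backend,
                            rel->rd_rel->relpersistence);

    /* buffer eviction: no relcache entry, so pass 0 ("unknown");
     * only a few operations, e.g. smgrwrite(), accept that */
    reln = smgropen(buf->tag.rnode, InvalidBackendId, 0);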
--- contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 629 +++++++++++++++++++---- contrib/zenith/relsize_cache.c | 15 + src/backend/access/common/bufmask.c | 2 - src/backend/access/gin/gininsert.c | 7 + src/backend/access/gist/gistbuild.c | 15 +- src/backend/access/gist/gistutil.c | 2 - src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/spgist/spginsert.c | 8 +- src/backend/access/transam/xloginsert.c | 15 +- src/backend/access/transam/xlogutils.c | 2 +- src/backend/catalog/storage.c | 10 +- src/backend/commands/tablecmds.c | 8 +- src/backend/storage/buffer/bufmgr.c | 20 +- src/backend/storage/buffer/localbuf.c | 2 +- src/backend/storage/page/bufpage.c | 1 - src/backend/storage/smgr/md.c | 4 +- src/backend/storage/smgr/smgr.c | 43 +- src/backend/tcop/zenith_wal_redo.c | 3 +- src/backend/utils/adt/dbsize.c | 26 +- src/include/storage/bufmgr.h | 2 + src/include/storage/bufpage.h | 19 +- src/include/storage/smgr.h | 13 +- src/include/utils/rel.h | 6 +- 24 files changed, 672 insertions(+), 183 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 073568f90c3..3643971f254 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -193,5 +193,6 @@ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); #endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index ac7e94f74c0..99914365428 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -3,6 +3,37 @@ * pagestore_smgr.c * * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
+ * + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,15 +45,18 @@ */ #include "postgres.h" +#include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" +#include "catalog/pg_class.h" #include "pagestore_client.h" #include "storage/relfilenode.h" #include "storage/smgr.h" #include "access/xlogdefs.h" #include "postmaster/interrupt.h" #include "storage/bufmgr.h" +#include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" #include "pgstat.h" @@ -40,7 +74,6 @@ #ifdef DEBUG_COMPARE_LOCAL #include "access/nbtree.h" #include "storage/bufpage.h" -#include "storage/md.h" #include "access/xlog_internal.h" static char *hexdump_page(char *page); @@ -59,6 +92,18 @@ char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -328,13 +373,22 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, { PGAlignedBlock copied_buffer; - /* set the flag in the original page, like log_newpage() does. */ - ((PageHeader) page)->pd_flags |= PD_WAL_LOGGED; - memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); } +/* + * Is 'buffer' identical to a freshly initialized empty heap page? + */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) @@ -345,18 +399,11 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, return; /* - * If the page was not WAL-logged before eviction then we can lose its - * modification. PD_WAL_LOGGED bit is used to mark pages which are - * wal-logged. - * - * See also comments to PD_WAL_LOGGED. - * - * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the - * whole index. That's duplicative with the WAL-logging that we do here. - * See log_newpage_range() calls. - * - * FIXME: Redoing this record will set the LSN on the page. That could - * mess up the LSN-NSN interlock in GiST index build. + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. */ if (forknum == FSM_FORKNUM && !RecoveryInProgress()) { @@ -366,12 +413,13 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; - elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. 
Evicted at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) { @@ -388,77 +436,83 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, XLogFlush(recptr); lsn = recptr; - elog(SmgrTrace, "Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (!(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED) - && !RecoveryInProgress()) + else if (lsn == InvalidXLogRecPtr) { - XLogRecPtr recptr; - /* - * We assume standard page layout here. + * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, + * and we can just ignore that in Zenith. We do need to remember the new size, + * though, so that smgrnblocks() returns the right answer after the rel has + * been extended. We rely on the relsize cache for that. * - * But at smgr level we don't really know what kind of a page this is. - * We have filtered visibility map pages and fsm pages above. TODO Do - * we have any special page types? - */ - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); - - /* - * If we wal-log hint bits, someone could concurrently update page and - * reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + * A completely empty heap page doesn't need to be WAL-logged, either. The + * heapam can leave such a page behind, if e.g. an insert errors out after + * initializing the page, but before it has inserted the tuple and WAL-logged + * the change. When we read the page from the page server, it will come back + * as all-zeros. That's OK, the heapam will initialize an all-zeros page on + * first use. * - * See comment to FlushBuffer(). The caller must hold a pin on the - * buffer and have share-locked the buffer contents. (Note: a - * share-lock does not prevent updates of hint bits in the buffer, so - * the page could change while the write is in progress, but we assume - * that that will not invalidate the data written.) + * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies + * that the page was not WAL-logged, and its contents will be lost when it's + * evicted. */ - Assert(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by - * log_newpage */ - - /* - * Need to flush it too, so that it gets sent to the Page Server - * before we might need to read it back. It should get flushed - * eventually anyway, at least if there is some other WAL activity, so - * this isn't strictly necessary for correctness. But if there is no - * other WAL activity, the page read might get stuck waiting for the - * record to be streamed out for an indefinite time. - * - * FIXME: Flushing the WAL is expensive. We should track the last - * "evicted" LSN instead, and update it here. 
Or just kick the - * bgwriter to do the flush, there is no need for us to block here - * waiting for it to finish. - */ - XLogFlush(recptr); - lsn = recptr; - elog(SmgrTrace, "Force wal logging of page %u of relation %u/%u/%u.%u, lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } } else { - elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ SetLastWrittenPageLSN(lsn); } - /* * zenith_init() -- Initialize private state */ @@ -568,6 +622,29 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + /* + * We don't know if it's an unlogged rel stored locally, or permanent + * rel stored in the page server. First check if it exists locally. + * If it does, great. Otherwise check if it exists in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -615,6 +692,23 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) void zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + elog(SmgrTrace, "Create relation %u/%u/%u.%u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -648,9 +742,13 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { -#ifdef DEBUG_COMPARE_LOCAL + /* + * Might or might not exist locally, depending on whether it's + * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is + * set). 
Try to unlink, it won't do any harm if the file doesn't + * exist. + */ mdunlink(rnode, forkNum, isRedo); -#endif } /* @@ -668,7 +766,25 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -691,13 +807,16 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void zenith_open(SMgrRelation reln) { + /* + * We don't have anything special to do here. Call mdopen() to let md.c + * initialize itself. That's only needed for temporary or unlogged + * relations, but it's dirt cheap so do it always to make sure the md + * fields are initialized, for debugging purposes if nothing else. + */ + mdopen(reln); + /* no work */ elog(SmgrTrace, "[ZENITH_SMGR] open noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdopen(reln); -#endif } /* @@ -706,13 +825,11 @@ zenith_open(SMgrRelation reln) void zenith_close(SMgrRelation reln, ForkNumber forknum) { - /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] close noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdclose(reln, forknum); -#endif + /* + * Let md.c close it, if it had it open. Doesn't hurt to do this + * even for permanent relations that have no local storage. + */ + mdclose(reln, forknum); } /* @@ -721,6 +838,23 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { + switch (reln->smgr_relpersistence) + { + case 0: + /* probably shouldn't happen, but ignore it */ + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); return true; @@ -736,6 +870,25 @@ void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); @@ -756,6 +909,23 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + request_lsn = zenith_get_request_lsn(&latest); { 
ZenithGetPageRequest request = { @@ -796,9 +966,6 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, pfree(resp); - /* Clear PD_WAL_LOGGED bit stored in WAL record */ - ((PageHeader) buffer)->pd_flags &= ~PD_WAL_LOGGED; - #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -915,6 +1082,38 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + zenith_wallog_page(reln, forknum, blocknum, buffer); lsn = PageGetLSN(buffer); @@ -942,8 +1141,32 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdnblocks(reln, forknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + { + elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, n_blocks); return n_blocks; + } request_lsn = zenith_get_request_lsn(&latest); { @@ -1002,6 +1225,24 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdtruncate(reln, forknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); /* @@ -1044,6 +1285,24 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) void zenith_immedsync(SMgrRelation reln, ForkNumber forknum) { + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdimmedsync(reln, forknum); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -1052,6 +1311,178 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +/* + * zenith_start_unlogged_build() -- Starting build operation on a rel. 
+ * + * Some indexes are built in two phases, by first populating the table with + * regular inserts, using the shared buffer cache but skipping WAL-logging, + * and WAL-logging the whole relation after it's done. Zenith relies on the + * WAL to reconstruct pages, so we cannot use the page server in the + * first phase when the changes are not logged. + */ +static void +zenith_start_unlogged_build(SMgrRelation reln) +{ + /* + * Currently, there can be only one unlogged relation build operation in + * progress at a time. That's enough for the current usage. + */ + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + elog(ERROR, "unlogged relation build is already in progress"); + Assert(unlogged_build_rel == NULL); + + ereport(SmgrTrace, + (errmsg("starting unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (smgrnblocks(reln, MAIN_FORKNUM) != 0) + elog(ERROR, "cannot perform unlogged index build, index is not empty "); + + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; + + /* Make the relation look like it's unlogged */ + reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; + + /* + * FIXME: should we pass isRedo true to create the tablespace dir if it + * doesn't exist? Is it needed? + */ + mdcreate(reln, MAIN_FORKNUM, false); +} + +/* + * zenith_finish_unlogged_build_phase_1() + * + * Call this after you have finished populating a relation in unlogged mode, + * before you start WAL-logging it. + */ +static void +zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) + return; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; +} + +/* + * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * + * Call this after you have finished WAL-logging an relation that was + * first populated without WAL-logging. + * + * This removes the local copy of the rel, since it's now been fully + * WAL-logged and is present in the page server. 
+ */ +static void +zenith_end_unlogged_build(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("ending unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) + { + RelFileNodeBackend rnode; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + /* Make the relation look permanent again */ + reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; + + /* Remove local copy */ + rnode = reln->smgr_rnode; + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + rnode.node.spcNode, + rnode.node.dbNode, + rnode.node.relNode, + forknum); + + forget_cached_relsize(rnode.node, forknum); + mdclose(reln, forknum); + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rnode, forknum, true); + } + } + + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +} + +static void +AtEOXact_zenith(XactEvent event, void *arg) +{ + switch (event) + { + case XACT_EVENT_ABORT: + case XACT_EVENT_PARALLEL_ABORT: + + /* + * Forget about any build we might have had in progress. The local + * file will be unlinked by smgrDoPendingDeletes() + */ + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + break; + + case XACT_EVENT_COMMIT: + case XACT_EVENT_PARALLEL_COMMIT: + case XACT_EVENT_PREPARE: + case XACT_EVENT_PRE_COMMIT: + case XACT_EVENT_PARALLEL_PRE_COMMIT: + case XACT_EVENT_PRE_PREPARE: + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("unlogged index build was not properly finished")))); + } + break; + } +} + static const struct f_smgr zenith_smgr = { .smgr_init = zenith_init, @@ -1069,6 +1500,10 @@ static const struct f_smgr zenith_smgr = .smgr_nblocks = zenith_nblocks, .smgr_truncate = zenith_truncate, .smgr_immedsync = zenith_immedsync, + + .smgr_start_unlogged_build = zenith_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = zenith_end_unlogged_build, }; @@ -1086,6 +1521,8 @@ smgr_zenith(BackendId backend, RelFileNode rnode) void smgr_init_zenith(void) { + RegisterXactCallback(AtEOXact_zenith, NULL); + smgr_init_standard(); zenith_init(); } diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index 0ba99a128f9..993903b1b18 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -127,6 +127,21 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } } +void +forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + LWLockRelease(relsize_lock); + } +} + void relsize_hash_init(void) { diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index ce5d9c0341e..409acecf42a 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -54,8 +54,6 @@ mask_page_hint_bits(Page page) PageClearFull(page); PageClearHasFreeLinePointers(page); - 
phdr->pd_flags &= ~PD_WAL_LOGGED; - /* * During replay, if the page LSN has advanced past our XLOG record's LSN, * we don't mark the page all-visible. See heap_xlog_visible() for diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0e8672c9e90..dfad28d1f61 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. @@ -418,6 +422,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); /* * Return statistics diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 69bbeaf0fb0..55a194a691f 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -40,6 +40,7 @@ #include "access/tableam.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -289,6 +290,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -321,6 +324,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. 
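The GIN, GiST and SP-GiST build changes in this patch all follow the same
shape; condensed into one sketch (the function names are the ones used in the
surrounding hunks, not a literal excerpt of any single one of them):

    /* Phase 1: populate the index locally, without WAL-logging. */
    smgr_start_unlogged_build(index->rd_smgr);

    /* ... build the index through the shared buffer cache, skipping WAL ... */

    smgr_finish_unlogged_build_phase_1(index->rd_smgr);

    /* Phase 2: WAL-log the finished relation in one pass. */
    if (RelationNeedsWAL(index))
        log_newpage_range(index, MAIN_FORKNUM,
                          0, RelationGetNumberOfBlocks(index), true);

    /* Make later reads request at least this LSN from the page server. */
    SetLastWrittenPageLSN(XactLastRecEnd);

    /* Drop the local copy; the page server now has the WAL-logged pages. */
    smgr_end_unlogged_build(index->rd_smgr);
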
@@ -331,6 +336,9 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -454,8 +462,13 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); + SetLastWrittenPageLSN(lsn); + } pfree(pagestate->page); pfree(pagestate); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 1a1bb4a53f6..43ba03b6eb9 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -866,8 +866,6 @@ gistNewBuffer(Relation r) if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); - ((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; - return buffer; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 97b62f74595..8198aa5051a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -627,7 +627,7 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) { SMgrRelation dstrel; - dstrel = smgropen(*newrnode, rel->rd_backend); + dstrel = smgropen(*newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index cc4394b1c8d..a7608f4d54c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + /* * Initialize the meta page and root pages */ @@ -105,7 +107,6 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); MarkBufferDirty(nullbuffer); - END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); @@ -131,6 +132,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -141,6 +144,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 8d7895af6f3..134c78f12b5 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -239,7 +239,6 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; - ((PageHeader)regbuf->page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -295,7 +294,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; - ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -1187,18 +1185,7 @@ log_newpage_range(Relation rel, ForkNumber forkNum, MarkBufferDirty(bufpack[i]); } - /* - * Zenith forces WAL logging of evicted pages, - * so it can happen that in some cases when pages are first - * modified and then WAL logged (for example building GiST/GiN - * indexes) there are no more pages which need to be WAL logged at - * the end of build procedure. As far as XLogInsert throws error - * if not records were inserted, we need to reset the insert state. - */ - if (nbufs > 0) - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); - else - XLogResetInsertion(); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); for (i = 0; i < nbufs; i++) { diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index baf4dbed4aa..c5d03cd4b83 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -463,7 +463,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(blkno != P_NEW); /* Open the relation at smgr level */ - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. This lets us cope diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 8f74a3efdeb..d40183aff74 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -143,7 +143,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) return NULL; /* placate compiler */ } - srel = smgropen(rnode, backend); + srel = smgropen(rnode, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -667,7 +667,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->relnode, pending->backend); + srel = smgropen(pending->relnode, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -748,7 +748,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rnode, InvalidBackendId); + srel = smgropen(pendingsync->rnode, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller relations. 
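Most of the remaining hunks update smgropen() callers to supply the new third
argument. The convention, condensed from the call sites changed in this patch
(an illustration, not an exhaustive list):

    /* A Relation is at hand: pass the catalog value. */
    srel = smgropen(rel->rd_node, rel->rd_backend, rel->rd_rel->relpersistence);

    /* WAL redo: only permanent relations are WAL-logged. */
    reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);

    /* Local buffers belong to temporary relations. */
    oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP);

    /* Persistence unknown and not needed (unlink, writeback, fsync): pass 0. */
    srel = smgropen(pending->relnode, pending->backend, 0);
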
@@ -957,7 +957,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -970,7 +970,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index bb0ea30dcd0..c6ba79439f3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -700,12 +700,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); - if (stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED) - { - /* Unlogged tables are not supported by Zenith */ - stmt->relation->relpersistence = RELPERSISTENCE_PERMANENT; - } - /* * Determine the lockmode to use when scanning parents. A self-exclusive * lock is needed here. @@ -14239,7 +14233,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) { SMgrRelation dstrel; - dstrel = smgropen(newrnode, rel->rd_backend); + dstrel = smgropen(newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 1e73dbbcc58..6168b069b26 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -796,7 +796,7 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); Assert(InRecovery); @@ -1625,11 +1625,6 @@ MarkBufferDirty(Buffer buffer) if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } - /* - * Clear PD_WAL_LOGGED flag so that if dirty page is evicted from page pool - * before been WAL logged, FPI WAL record will be enforced. - */ - ((PageHeader)BufferGetPage(buffer))->pd_flags &= ~PD_WAL_LOGGED; } /* @@ -2045,15 +2040,6 @@ BufferSync(int flags) item->blockNum = bufHdr->tag.blockNum; } - /* Zenith XXX - * Consider marking this page as not WAL-logged, - * so that pagestore_smgr issued a log record before eviction - * and persisted hint changes. - * TODO: check performance impacts of this approach - * since extra wal-logging may worsen the performance. - */ - //((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; - UnlockBufHdr(bufHdr, buf_state); /* Check for barrier events in case NBuffers is large. 
*/ @@ -2900,7 +2886,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(buf->tag.rnode, InvalidBackendId); + reln = smgropen(buf->tag.rnode, InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, @@ -4901,7 +4887,7 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId); + reln = smgropen(tag.rnode, InvalidBackendId, 0); smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04b3558ea33..b9811cc7327 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -215,7 +215,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId); + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 48dc7bde265..3616846ad07 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,7 +427,6 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); - ((PageHeader)oldPage)->pd_flags &= ~PD_WAL_LOGGED; pfree(tempPage); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 72f1494c7ab..0d834c69ac5 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1058,7 +1058,7 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0); if (isRedo) { @@ -1336,7 +1336,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0); File file; bool need_to_close; int result, diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 8d2b6b73b29..10a6f65c118 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -135,9 +135,14 @@ smgr(BackendId backend, RelFileNode rnode) * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. + * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. 
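How "require it" plays out can be seen in the pagestore_smgr.c hunks earlier
in this patch; the recurring idiom, sketched here for a read-style operation
(variable names as in those hunks):

    switch (reln->smgr_relpersistence)
    {
        case 0:
            elog(ERROR, "cannot call smgrread() on rel with unknown persistence");

        case RELPERSISTENCE_PERMANENT:
            break;              /* continue to the page-server path */

        case RELPERSISTENCE_TEMP:
        case RELPERSISTENCE_UNLOGGED:
            mdread(reln, forkNum, blkno, buffer);   /* served from local md storage */
            return;

        default:
            elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
    }
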
*/ SMgrRelation -smgropen(RelFileNode rnode, BackendId backend) +smgropen(RelFileNode rnode, BackendId backend, char relpersistence) { RelFileNodeBackend brnode; SMgrRelation reln; @@ -168,6 +173,7 @@ smgropen(RelFileNode rnode, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; @@ -179,6 +185,17 @@ smgropen(RelFileNode rnode, BackendId backend) /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. + */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + Assert(relpersistence == 0 || reln->smgr_relpersistence == relpersistence); + } return reln; } @@ -652,6 +669,30 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) (*reln->smgr).smgr_immedsync(reln, forknum); } +/* + * Zenith-added functions to mark the phases of an unlogged index build. + */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); +} + /* * AtEOXact_SMgr * diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index a02592fc0c2..9e1620922ce 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -74,6 +74,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" +#include "catalog/pg_class.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -530,7 +531,7 @@ BeginRedoForBlock(StringInfo input_message) MemoryContextSwitchTo(oldcxt); - reln = smgropen(rnode, InvalidBackendId); + reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index ade36f28be5..33474e01941 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -271,12 +271,15 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. 
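A side effect of the smgropen() logic above is that a cached SMgrRelation can
be "upgraded" once some caller that knows the persistence comes along. A
sketch with hypothetical calls (same rnode both times):

    SMgrRelation a = smgropen(rnode, InvalidBackendId, 0);  /* unknown */
    SMgrRelation b = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);

    /* Same hash entry; the second call filled in the missing persistence. */
    Assert(a == b && b->smgr_relpersistence == RELPERSISTENCE_PERMANENT);

Passing two conflicting non-zero values for the same relation would instead
trip the Assert in smgropen(), in assertion-enabled builds.
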
*/ static int64 -calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum, char relpersistence) { - SMgrRelation srel = smgropen(*rfn, backend); - if (smgrexists(srel, forknum)) { + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); + + if (smgrexists(srel, forknum)) + { BlockNumber n = smgrnblocks(srel, forknum); - return (int64)n*BLCKSZ; + + return (int64) n * BLCKSZ; } return 0; } @@ -302,7 +305,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -327,7 +331,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_node), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -341,7 +346,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_node), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -370,7 +376,8 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkNum); + forkNum, + rel->rd_rel->relpersistence); /* * Size of toast relation @@ -410,7 +417,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_node), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 6d140786c74..40fcdf6d871 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -227,6 +227,8 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern void MarkBufferPermanent(Buffer buffer); + extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 6704f69f328..c86ccdaf608 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -182,24 +182,7 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -/* Zenith XXX: - * Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) - * or delay wal-logging till the end of operation (i.e. index build). - * - * So if such page is evicted, we will lose the update. - * To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. - * If page is evicted before it has been wal-logged, then pagestore_smgr creates FPI for it. - * - * List of such operations: - * - GIN/GiST/SP-GiST index build - * - page and heaptuple hint bits - * - Clearing visibility map bits - * - FSM changes - * - ??? 
- */ -#define PD_WAL_LOGGED 0x0008 /* Page is wal-logged */ -#define PD_VALID_FLAG_BITS 0x000F /* OR of all valid pd_flags bits */ - +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a7c98c7e7fe..c08eaed6179 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -43,6 +43,9 @@ typedef struct SMgrRelationData /* rnode is the hashtable lookup key, so it must be first! */ RelFileNodeBackend smgr_rnode; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -115,6 +118,10 @@ typedef struct f_smgr void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -132,7 +139,7 @@ extern const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode); extern const f_smgr *smgr(BackendId backend, RelFileNode rnode); extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -159,4 +166,8 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + #endif /* SMGR_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index f2d53c92e50..86b9e28edf5 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -544,7 +544,8 @@ static inline SMgrRelation RelationGetSmgr(Relation rel) { if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend)); + smgrsetowner(&(rel->rd_smgr), + smgropen(rel->rd_node, rel->rd_backend, rel->rd_rel->relpersistence)); return rel->rd_smgr; } #endif /* !FRONTEND */ @@ -559,7 +560,8 @@ RelationGetSmgr(Relation rel) #define RelationOpenSmgr(relation) \ do { \ if ((relation)->rd_smgr == NULL) \ - smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ + smgrsetowner(&((relation)->rd_smgr), \ + smgropen((relation)->rd_node, (relation)->rd_backend, (relation)->rd_rel->relpersistence)); \ } while (0) /* From 28e1bcf27c3ecf2019766dfed5eaade8f07bb12a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 14 Oct 2021 15:03:14 +0300 Subject: [PATCH 064/214] Fix queue cleanup in proposer (#93) Queue was moved further than truncateLsn, when quorumLsn matched end of wal record in the middle of queue message. Fix cleanup of unreceived messages. 
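After the fix, cleanup only drops messages that are both acknowledged by every
safekeeper and entirely below truncateLsn, so the following invariant (the
assertion added by this patch at the end of HandleWalKeeperResponse) holds for
the head of the queue:

    /* truncateLsn always points into the first message left in the queue;
     * Max() covers the special zero-sized messages. */
    Assert(truncateLsn >= msgQueueHead->req.beginLsn &&
           truncateLsn < Max(msgQueueHead->req.endLsn,
                             msgQueueHead->req.beginLsn + 1));
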
Co-authored-by: Arseny Sher --- src/backend/replication/walproposer.c | 39 +++++++++++++++++++-------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 35624a77352..526602e953b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -77,8 +77,8 @@ static ProposerGreeting proposerGreeting; static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) + * minimal LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; static XLogRecPtr candidateTruncateLsn; @@ -360,14 +360,10 @@ HandleWalKeeperResponse(void) EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } - - /* Cleanup message queue */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) + /* Advance truncateLsn */ + WalMessage *msgQueueAck = msgQueueHead; + while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) { - WalMessage *msg = msgQueueHead; - - msgQueueHead = msg->next; - /* * This piece is received by everyone; try to advance truncateLsn, but * hold it back to nearest commitLsn. Thus we will always start @@ -383,22 +379,39 @@ HandleWalKeeperResponse(void) * read from WAL and send are plain sheets of bytes, but safekeepers * ack only on commit boundaries. */ - if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) + if (msgQueueAck->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) { truncateLsn = minQuorumLsn; candidateTruncateLsn = InvalidXLogRecPtr; } - else if (msg->req.endLsn >= candidateTruncateLsn && + else if (msgQueueAck->req.endLsn >= candidateTruncateLsn && candidateTruncateLsn != InvalidXLogRecPtr) { truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } + + msgQueueAck = msgQueueAck->next; + } + + /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + { + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ + if (msgQueueHead) + { + /* Max takes care of special 0-sized messages */ + Assert(truncateLsn >= msgQueueHead->req.beginLsn && + truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); + } /* * Generally sync is done when majority switched the epoch so we committed @@ -1370,6 +1383,10 @@ AdvancePollState(int i, uint32 events) if (++n_votes != quorum) { + /* Can't start streaming earlier than truncateLsn */ + wk->startStreamingAt = truncateLsn; + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + /* * We are already streaming WAL: send all pending messages * to the attached walkeeper From bef750f1dc4ee4bf9f61e1bdfc9c11e19e6a664a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Oct 2021 09:47:32 +0300 Subject: [PATCH 065/214] Support read-only nodes This changes the format of the 'zenith.signal' file. 
It is now a human-readable text file, with one line like "PREV LSN: 0/1234568", or "PREV LSN: none" if the prev LSN is not known, or "PREV LSN: invalid" if starting up in read-write is not allowed. Also, if 'zenith.signal' is present, don't try to read the checkpoint record from the WAL. Trust the copy in pg_control, instead. --- contrib/zenith/pagestore_smgr.c | 9 +- src/backend/access/transam/xlog.c | 193 ++++++++++++++++++++------ src/include/access/xlog.h | 2 + src/include/replication/walproposer.h | 2 +- 4 files changed, 156 insertions(+), 50 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 99914365428..cfb1068e122 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -560,13 +560,14 @@ zenith_get_request_lsn(bool *latest) if (RecoveryInProgress()) { + *latest = false; lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = InvalidXLogRecPtr; } else if (am_walsender) { + *latest = true; lsn = InvalidXLogRecPtr; elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); } @@ -579,6 +580,7 @@ zenith_get_request_lsn(bool *latest) * pages modified by later WAL records must still in the buffer cache, * so our request cannot concern those. */ + *latest = true; lsn = GetLastWrittenPageLSN(); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", @@ -602,11 +604,6 @@ zenith_get_request_lsn(bool *latest) } } - /* - * FIXME: In read-only mode, we would need to set *latest=false here. But - * we don't support read-only mode at the moment - */ - *latest = true; return lsn; } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2eee4a9dbdc..6ae94d2ecac 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -281,6 +281,13 @@ bool InArchiveRecovery = false; static bool standby_signal_file_found = false; static bool recovery_signal_file_found = false; +/* + * Variables read from 'zenith.signal' file. + */ +bool ZenithRecoveryRequested = false; +XLogRecPtr zenithLastRec = InvalidXLogRecPtr; +bool zenithWriteOk = false; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; @@ -5551,6 +5558,81 @@ readRecoverySignalFile(void) errmsg("standby mode is not supported by single-user servers"))); } +static void +readZenithSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. 
Forbid starting up in read-write mode */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. + */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (zenithLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithWriteOk = true; + } + ZenithRecoveryRequested = true; + close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X", + LSN_FORMAT_ARGS(zenithLastRec)); + } +} + static void validateRecoveryParameters(void) { @@ -6512,7 +6594,6 @@ StartupXLOG(void) bool reachedRecoveryTarget = false; bool haveBackupLabel = false; bool haveTblspcMap = false; - bool skipLastRecordReread = false; XLogRecPtr RecPtr, checkPointLoc, EndOfLog; @@ -6537,10 +6618,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read zenith.signal before anything else. + */ + readZenithSignalFile(); + /* * Check that contents look valid. */ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested) ereport(FATAL, (errmsg("control file contains invalid checkpoint location"))); @@ -6670,6 +6756,9 @@ StartupXLOG(void) else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -6800,6 +6889,29 @@ StartupXLOG(void) /* set flag to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. 
Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + checkPointLoc = zenithLastRec; + RedoStartLSN = ControlFile->checkPointCopy.redo; + EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + + XLogBeginRead(xlogreader, EndRecPtr); + } else { /* @@ -7057,30 +7169,10 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; - if (RecPtr < checkPoint.redo) - { - int fd = BasicOpenFile("zenith.signal", O_RDWR | PG_BINARY); - if (fd >= 0) { - XLogRecPtr prevRecPtr = 0; - if ((size_t)read(fd, &prevRecPtr, sizeof prevRecPtr) != sizeof(prevRecPtr)) { - elog(LOG, "can't read previous record position from zenith.signal file: %m"); - } - LastRec = prevRecPtr; - /* Zenith hacks to spawn compute node without WAL */ - EndRecPtr = RecPtr = checkPoint.redo; - skipLastRecordReread = true; - close(fd); - elog(LOG, - "[ZENITH] found 'zenith.signal' file. Setting prevRecPtr to %X/%X", - LSN_FORMAT_ARGS(prevRecPtr)); - } - else - { - ereport(PANIC, - (errmsg("invalid redo in checkpoint record"))); - } - } + if (RecPtr < checkPoint.redo && !ZenithRecoveryRequested) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); /* * Check whether we need to force recovery from WAL. If it appears to @@ -7745,25 +7837,40 @@ StartupXLOG(void) * that and continue after it. In all other cases, re-fetch the last * valid or last applied record, so we can identify the exact endpoint of * what we consider the valid portion of WAL. + * + * When starting from a zenith base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. */ - - /* - * We use the last WAL page to initialize the WAL for writing, - * so we better have it in memory. - */ - if (skipLastRecordReread) + if (ZenithRecoveryRequested) { - int offs = (EndRecPtr % XLOG_BLCKSZ); - XLogRecPtr lastPage = EndRecPtr - offs; - int idx = XLogRecPtrToBufIdx(lastPage); - XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); - xlogPageHdr->xlp_pageaddr = lastPage; - xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; - xlogPageHdr->xlp_tli = ThisTimeLineID; - xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; - xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; - readOff = XLogSegmentOffset(lastPage, wal_segment_size); - elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) 
+ */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int offs = (EndRecPtr % XLOG_BLCKSZ); + XLogRecPtr lastPage = EndRecPtr - offs; + int idx = XLogRecPtrToBufIdx(lastPage); + XLogPageHeader xlogPageHdr = (XLogPageHeader) (XLogCtl->pages + idx * XLOG_BLCKSZ); + + xlogPageHdr->xlp_pageaddr = lastPage; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = ThisTimeLineID; + xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; // FIXME + xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; + readOff = XLogSegmentOffset(lastPage, wal_segment_size); + + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + + // FIXME: should we unlink zenith.signal? + } } else { @@ -7966,7 +8073,7 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; - if (!skipLastRecordReread) + if (!ZenithRecoveryRequested) memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 8b8b14d2fd0..986eb957570 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -403,6 +403,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 222faaea41d..c6ece7a8ec7 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -263,7 +263,7 @@ struct WalMessage /* PHANTOM FIELD: * - * All WalMessages are allocated with exactly (size - sizeof(WalKeeperRequest)) additional bytes + * All WalMessages are allocated with exactly (size - sizeof(AppendRequestHeader)) additional bytes * after them, containing the body of the message. This allocation is done in `CreateMessage` * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ }; From 1b45bf61875f757b3d76a3bb309fe6f4fac54726 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 16:28:52 +0300 Subject: [PATCH 066/214] Remove a drop of syncSafekeepers complexity. --- src/backend/replication/walproposer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 526602e953b..7cf0414a4cc 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1426,8 +1426,8 @@ AdvancePollState(int i, uint32 events) if (syncSafekeepers) { - /* Wait until all walkeepers are synced */ - WalProposerLoop(); + /* keep polling until all walkeepers are synced */ + return; } } else if (syncSafekeepers) From e63789c3cb7e35326602a2b5a9cb8b21741b6019 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 16:36:28 +0300 Subject: [PATCH 067/214] Fix compiler warning. 
warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 364 | WalMessage *msgQueueAck = msgQueueHead; --- src/backend/replication/walproposer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7cf0414a4cc..5bb6322d29d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -339,6 +339,7 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + WalMessage *msgQueueAck; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -361,7 +362,7 @@ HandleWalKeeperResponse(void) } /* Advance truncateLsn */ - WalMessage *msgQueueAck = msgQueueHead; + msgQueueAck = msgQueueHead; while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) { /* From b70693b69f7f96dee21d33513100311cbaf54f94 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 18 Sep 2021 17:45:56 +0300 Subject: [PATCH 068/214] Implement backpressure for compute node to avoid WAL overflow --- contrib/zenith/pagestore_smgr.c | 5 ++- src/backend/access/transam/xloginsert.c | 43 +++++++++++++++++++++++ src/backend/replication/walproposer.c | 45 +++++++++++++++++++++++-- src/backend/replication/walsender.c | 29 ++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 1 - src/backend/utils/activity/wait_event.c | 3 ++ src/backend/utils/misc/guc.c | 24 +++++++++++++ src/include/access/xlogdefs.h | 8 +++++ src/include/access/xloginsert.h | 3 ++ src/include/replication/walproposer.h | 2 ++ src/include/replication/walsender.h | 3 +- src/include/utils/wait_event.h | 3 +- 12 files changed, 160 insertions(+), 9 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index cfb1068e122..25ad896491b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -51,16 +51,16 @@ #include "access/xlog_internal.h" #include "catalog/pg_class.h" #include "pagestore_client.h" -#include "storage/relfilenode.h" +#include "pagestore_client.h" #include "storage/smgr.h" #include "access/xlogdefs.h" #include "postmaster/interrupt.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" #include "pgstat.h" -#include "replication/walsender.h" #include "catalog/pg_tablespace_d.h" /* @@ -781,7 +781,6 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 134c78f12b5..935298ac810 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -29,9 +29,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* Buffer size required to store a compressed version of backup block image */ #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) @@ -61,6 +63,10 @@ typedef struct char compressed_page[PGLZ_MAX_BLCKSZ]; } registered_buffer; +/* GUCs */ +int max_replication_write_lag; +int max_replication_flush_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated 
size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -115,6 +121,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); +/* Timeout in milliseconds for delaying backend WAL inserts to avoid WAL overflow */ +#define BACK_PRESSURE_TIMEOUT 100 +#define MB ((XLogRecPtr)1024*1024) /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). @@ -133,6 +142,40 @@ XLogBeginInsert(void) if (begininsert_called) elog(ERROR, "XLogBeginInsert was already called"); + if (max_replication_write_lag != 0 || max_replication_flush_lag != 0) + { + uint64 slept = 0; + + /* Suspend writes until replicas catch up */ + while (true) + { + XLogRecPtr replicaWriteLsn; + XLogRecPtr replicaFlushLsn; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn); + + if ((replicaWriteLsn != UnknownXLogRecPtr + && myFlushLsn > replicaWriteLsn + max_replication_write_lag*MB) || + (replicaFlushLsn != UnknownXLogRecPtr + && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) + { + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BACK_PRESSURE_TIMEOUT, + WAIT_EVENT_BACK_PRESSURE); + ResetLatch(MyLatch); + slept += BACK_PRESSURE_TIMEOUT; + } + else + break; + } + + // XXX: INFO will cause a lot of regression tests to fail. + if (slept > 0) + elog(DEBUG1, "slept for " UINT64_FORMAT " ms while waiting for all replicas to catch up", slept); + } + begininsert_called = true; } diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5bb6322d29d..53c7b06931a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -133,6 +133,23 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } +/* + * Get minimum of disk consistent LSNs of all safekeepers + */ +static XLogRecPtr +CalculateDiskConsistentLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.diskConsistentLsn < lsn) + { + lsn = walkeeper[i].feedback.diskConsistentLsn; + } + } + return lsn; +} + /* Initializes the internal event set, provided that it is currently null */ static void InitEventSet(void) @@ -339,16 +356,27 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; WalMessage *msgQueueAck; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastFeedback.flushLsn) + diskConsistentLsn = CalculateDiskConsistentLsn(); + + if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) { - lastFeedback.flushLsn = minQuorumLsn; + + if (minQuorumLsn > lastFeedback.flushLsn) + lastFeedback.flushLsn = minQuorumLsn; + + lastFeedback.diskConsistentLsn = diskConsistentLsn; + /* advance the replication slot */ if (!syncSafekeepers) - ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + ProcessStandbyReply(lastFeedback.diskConsistentLsn, + lastFeedback.flushLsn, + InvalidXLogRecPtr, GetCurrentTimestamp(), false); } + CombineHotStanbyFeedbacks(&hsFeedback); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) { @@ -1056,6 +1084,17 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } + if (rc == 0) /* timeout expired: poll state */ + { + /* + * 
If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (lastSentLsn != InvalidXLogRecPtr) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + } + } } } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 20a38385a43..ecda7c45791 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3778,3 +3778,32 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) Assert(time != 0); return now - time; } + +/* + * Get minimal write and flush LSN among all live replicas + */ +void +GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) +{ + XLogRecPtr min_write_lsn = UnknownXLogRecPtr; + XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; + for (int i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + if (walsnd->state == WALSNDSTATE_STREAMING) + { + /* + * We assume that reads from walsnd->write/flush are atomic + * on all modern x64 systems, as these fields are uint64 and + * should be 8-bytes aligned. + */ + XLogRecPtr written = walsnd->write; + XLogRecPtr flushed = walsnd->flush; + min_write_lsn = Min(written, min_write_lsn); + min_flush_lsn = Min(flushed, min_flush_lsn); + } + } + *write_lsn = min_write_lsn; + *flush_lsn = min_flush_lsn; +} + diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6168b069b26..fadcc1b6255 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -55,7 +55,6 @@ #include "utils/resowner_private.h" #include "utils/timestamp.h" - /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 1a30faf8ad4..1060ddf3037 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -491,6 +491,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_DELAY: event_name = "VacuumDelay"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 6a52c2f56d8..5271a9c594f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2912,6 +2912,30 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exeeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + 1024, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When lag between minimal flush position of replica and current LSN exeeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + 1, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, 
gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 0940b64ca6b..d44d5e64cdc 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -28,6 +28,14 @@ typedef uint64 XLogRecPtr; #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) +/* + * Maximum possible XLogRecPtr value. Currently used by back pressure + * mechanism to distinguish the unknown replica flush/write position. + * This significantly simplifies comparison and checks as we always + * look for the minimal value. + */ +#define UnknownXLogRecPtr ((XLogRecPtr) ~0) + /* * First LSN to use for "fake" LSNs. * diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index f1d8c39edf1..699ca56ed25 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,9 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_write_lag; +extern int max_replication_flush_lag; + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index c6ece7a8ec7..2e32e0f0f7c 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -294,6 +294,8 @@ typedef struct AppendResponse // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; + // Part of WALL applied and written to the disk by all pageservers + XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; } AppendResponse; diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 828106933ca..bd2f9ad6d28 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include /* @@ -47,7 +48,7 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); - +extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush); /* * Remember that we want to wakeup walsenders later * diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 4b1cea65938..c28cf949f9e 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -142,7 +142,8 @@ typedef enum WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL, WAIT_EVENT_VACUUM_DELAY, WAIT_EVENT_CHECKPOINT_WRITE_DELAY, - WAIT_EVENT_REGISTER_SYNC_REQUEST + WAIT_EVENT_REGISTER_SYNC_REQUEST, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- From 92f0a928f5205886777d3c63af62893069102df7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 19 Oct 2021 19:12:38 +0300 Subject: [PATCH 069/214] Initialize FSM/VM pages through buffer cache To prevent loading them from pageserver. Author: Konstantin Knizhnik with my extension to VM as well. 
--- src/backend/access/heap/visibilitymap.c | 17 +++++++++++++++-- src/backend/storage/freespace/freespace.c | 16 ++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4720b35ee5c..2e11f3fc0e4 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -654,9 +654,22 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { - PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); + /* + * NEON: Initialize VM pages through buffer cache to prevent loading + * them from pageserver. + */ + Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, + RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, vm_nblocks_now); + MarkBufferDirty(buffer); + + smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, page, false); + + UnlockReleaseBuffer(buffer); - smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false); vm_nblocks_now++; } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 09d4b16067d..4e0e20403c0 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -647,10 +647,22 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) /* Extend as needed. */ while (fsm_nblocks_now < fsm_nblocks) { - PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); + /* + * NEON: Initialize FSM pages through buffer cache to prevent loading + * them from pageserver. + */ + Buffer buffer = ReadBufferExtended(rel, FSM_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, fsm_nblocks_now); + MarkBufferDirty(buffer); smgrextend(reln, FSM_FORKNUM, fsm_nblocks_now, - pg.data, false); + page, false); + + UnlockReleaseBuffer(buffer); + fsm_nblocks_now++; } From 48e45e303028c65d303e2d53ad42c3fc3cf18999 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 21 Oct 2021 23:16:00 +0300 Subject: [PATCH 070/214] Turn off back pressure by default --- src/backend/access/transam/xloginsert.c | 3 ++- src/backend/utils/misc/guc.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 935298ac810..5f3e1ff9c9e 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -121,9 +121,10 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); -/* Timeout in milliseconds for delaying backend WAL inserts to avoid WAL overflow */ +/* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ #define BACK_PRESSURE_TIMEOUT 100 #define MB ((XLogRecPtr)1024*1024) + /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5271a9c594f..9cb96b4a950 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2915,24 +2915,24 @@ static struct config_int ConfigureNamesInt[] = { {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal write lag between master and replicas."), - gettext_noop("When lag between minimal write position of replica and current LSN exeeds this value," + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," "backends are blocked"), GUC_UNIT_MB, }, &max_replication_write_lag, - 1024, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, { {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal flush lag between master and replicas."), - gettext_noop("When lag between minimal flush position of replica and current LSN exeeds this value," + gettext_noop("When lag between minimal flush position of replica and current LSN exceeds this value," "backends are blocked"), GUC_UNIT_MB, }, &max_replication_flush_lag, - 1, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, From 73c2bf228ea2a165fb995686266e38994cb07754 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 15:59:55 +0300 Subject: [PATCH 071/214] ShutdownConnection instead of ResetConnection in more places. At least currently risk of busy loop (e.g due to bugs) is much higher than benefit of additional availability if we immediately reconnect; add interval between the reconnection attempts. 
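For context, as the surrounding code defines them: ResetConnection() tears the connection down and immediately dials the safekeeper again, while ShutdownConnection() only closes the socket and parks the walkeeper in SS_OFFLINE, leaving the periodic ReconnectWalKeepers() pass to retry later. A rate-limited retry of roughly that shape could look like the sketch below; the RECONNECT_INTERVAL_MS constant and the last_reconnect_attempt field are illustrative assumptions, not something this patch adds:

    /* Illustrative only: retry offline walkeepers no more often than the interval. */
    #define RECONNECT_INTERVAL_MS 1000

    static void
    ReconnectOfflineWalKeepers(void)
    {
        TimestampTz now = GetCurrentTimestamp();

        for (int i = 0; i < n_walkeepers; i++)
        {
            WalKeeper  *wk = &walkeeper[i];

            if (wk->state != SS_OFFLINE)
                continue;

            /* skip walkeepers we tried recently; this is what breaks the busy loop */
            if (!TimestampDifferenceExceeds(wk->last_reconnect_attempt, now,
                                            RECONNECT_INTERVAL_MS))
                continue;

            wk->last_reconnect_attempt = now;
            ResetConnection(i);     /* dial again from SS_OFFLINE */
        }
    }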
--- src/backend/replication/walproposer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 53c7b06931a..667d18ba56e 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1213,7 +1213,7 @@ AdvancePollState(int i, uint32 events) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return; } @@ -1252,7 +1252,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return; /* @@ -1263,7 +1263,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); - ResetConnection(i); + ShutdownConnection(i); return; } break; @@ -1662,7 +1662,7 @@ AsyncRead(int i, void *value, size_t value_size) wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } @@ -1707,7 +1707,7 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } @@ -1756,7 +1756,7 @@ AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKee elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } From 6d3e7e8fcf2605a027f488c433c0413e47d620e3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 Nov 2021 19:55:40 +0200 Subject: [PATCH 072/214] Handle partial writes to stdout in WAL redo process. --- src/backend/tcop/zenith_wal_redo.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 9e1620922ce..0ddd2ddec24 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -660,6 +660,7 @@ GetPage(StringInfo input_message) BlockNumber blknum; Buffer buf; Page page; + int tot_written; /* * message format: @@ -683,7 +684,21 @@ GetPage(StringInfo input_message) /* single thread, so don't bother locking the page */ /* Response: Page content */ - write(STDOUT_FILENO, page, BLCKSZ); /* FIXME: check errors */ + tot_written = 0; + do { + ssize_t rc; + + rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); ReleaseBuffer(buf); DropDatabaseBuffers(rnode.dbNode); From d4fcfd999a6a369d62c47015be6e7e140dc183a0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 8 Nov 2021 15:59:23 +0300 Subject: [PATCH 073/214] Handle keepalives while receiving WAL in recovery. Since c310932 safekeeper sometimes sends it. 
ref #843 --- src/backend/replication/walproposer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 667d18ba56e..618c8992f57 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -978,7 +978,9 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - Assert(buf[0] == 'w'); + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); From ecfbad25429e104abfaf6144a0afc88e70b9340b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 9 Nov 2021 23:45:50 +0300 Subject: [PATCH 074/214] Fix truncateLsn update (#101) truncateLsn is now advanced to `Min(walkeeper[i].feedback.flushLsn)` with taking epochs into account. --- src/backend/replication/walproposer.c | 85 ++++++++++++--------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 618c8992f57..d99f9025b0c 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -81,7 +81,6 @@ static AppendResponse lastFeedback; * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; -static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -150,6 +149,26 @@ CalculateDiskConsistentLsn(void) return lsn; } +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + /* We can't rely on safekeeper flushLsn if it has wrong epoch */ + if (walkeeper[i].feedback.epoch != propTerm) + return 0; + + if (walkeeper[i].feedback.flushLsn < lsn) + lsn = walkeeper[i].feedback.flushLsn; + } + return lsn; +} + /* Initializes the internal event set, provided that it is currently null */ static void InitEventSet(void) @@ -357,7 +376,7 @@ HandleWalKeeperResponse(void) HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; XLogRecPtr diskConsistentLsn; - WalMessage *msgQueueAck; + XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = CalculateDiskConsistentLsn(); @@ -389,39 +408,24 @@ HandleWalKeeperResponse(void) EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } - /* Advance truncateLsn */ - msgQueueAck = msgQueueHead; - while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) - { - /* - * This piece is received by everyone; try to advance truncateLsn, but - * hold it back to nearest commitLsn. Thus we will always start - * streaming from the beginning of the record, which simplifies - * decoding on the far end. 
- * - * This also prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on commit boundaries. - */ - if (msgQueueAck->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) - { - truncateLsn = minQuorumLsn; - candidateTruncateLsn = InvalidXLogRecPtr; - } - else if (msgQueueAck->req.endLsn >= candidateTruncateLsn && - candidateTruncateLsn != InvalidXLogRecPtr) - { - truncateLsn = candidateTruncateLsn; - candidateTruncateLsn = InvalidXLogRecPtr; - } - - msgQueueAck = msgQueueAck->next; - } + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + truncateLsn = minFlushLsn; /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) @@ -1604,17 +1608,6 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - - /* - * commitLsn is always the record boundary; remember - * it so we can advance truncateLsn there. But do so - * only if previous value is applied, otherwise it - * might never catch up. - */ - if (candidateTruncateLsn == InvalidXLogRecPtr) - { - candidateTruncateLsn = minQuorumLsn; - } lastSentCommitLsn = minQuorumLsn; } break; From 450e3fb2729350a941637b359bbb1482b8aaa6fa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 10 Nov 2021 18:48:01 +0300 Subject: [PATCH 075/214] [walproposer] Get rid of SAB_Error after rebase Also see 1632ea43 for details. 
--- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d99f9025b0c..5512df16664 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -870,7 +870,7 @@ DetermineEpochStartLsn(void) */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, SAB_Error); + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); From 1a99944d11ca5040d5b529df5b376ee8a1efd4f5 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Nov 2021 13:49:50 +0300 Subject: [PATCH 076/214] Add term history to safekeepers. See corresponding zenith commit. --- src/backend/replication/walproposer.c | 531 +++++++++++++------- src/backend/replication/walproposer_utils.c | 51 ++ src/include/replication/walproposer.h | 57 ++- 3 files changed, 456 insertions(+), 183 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5512df16664..3defed7f9ab 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -43,6 +43,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "access/xlog.h" +#include "libpq/pqformat.h" #include "replication/slot.h" #include "replication/walreceiver.h" #include "postmaster/bgworker.h" @@ -82,6 +83,7 @@ static AppendResponse lastFeedback; */ static XLogRecPtr truncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ +static TermHistory propTermHistory; /* term history of the proposer */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ @@ -95,13 +97,19 @@ static bool syncSafekeepers; /* Declarations of a few functions ahead of time, so that we can define them out of order. 
*/ static void AdvancePollState(int i, uint32 events); -static bool AsyncRead(int i, void *value, size_t value_size); +static bool AsyncRead(int i, char **buf, int *buf_size); +static bool AsyncReadFixed(int i, void *value, size_t value_size); +static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); -static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); -static void HackyRemoveWalProposerEvent(int to_remove); -static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); +static bool AsyncFlush(int i, bool socket_read_ready); +static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); static void BroadcastMessage(WalMessage *msg); +static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(WalKeeper *wk); +static void SendProposerElected(WalKeeper *wk); +static void StartStreaming(WalKeeper *wk); /* @@ -159,10 +167,6 @@ CalculateMinFlushLsn(void) XLogRecPtr lsn = UnknownXLogRecPtr; for (int i = 0; i < n_walkeepers; i++) { - /* We can't rely on safekeeper flushLsn if it has wrong epoch */ - if (walkeeper[i].feedback.epoch != propTerm) - return 0; - if (walkeeper[i].feedback.flushLsn < lsn) lsn = walkeeper[i].feedback.flushLsn; } @@ -205,7 +209,7 @@ UpdateEventSet(WalKeeper *wk, uint32 events) * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. */ static void -HackyRemoveWalProposerEvent(int to_remove) +HackyRemoveWalProposerEvent(WalKeeper *to_remove) { /* Remove the existing event set */ if (waitEvents) @@ -228,7 +232,7 @@ HackyRemoveWalProposerEvent(int to_remove) wk->eventPos = -1; - if (i == to_remove) + if (wk == to_remove) continue; /* If this WAL keeper isn't offline, add an event for it! */ @@ -241,15 +245,18 @@ HackyRemoveWalProposerEvent(int to_remove) /* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(int i) +ShutdownConnection(WalKeeper *wk) { - if (walkeeper[i].conn) - walprop_finish(walkeeper[i].conn); - walkeeper[i].conn = NULL; - walkeeper[i].state = SS_OFFLINE; - walkeeper[i].currMsg = NULL; - - HackyRemoveWalProposerEvent(i); + if (wk->conn) + walprop_finish(wk->conn); + wk->conn = NULL; + wk->state = SS_OFFLINE; + wk->currMsg = NULL; + if (wk->voteResponse.termHistory.entries) + pfree(wk->voteResponse.termHistory.entries); + wk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(wk); } /* @@ -259,14 +266,13 @@ ShutdownConnection(int i) * On success, sets the state to SS_CONNECTING_WRITE. */ static void -ResetConnection(int i) +ResetConnection(WalKeeper *wk) { pgsocket sock; /* socket of the new connection */ - WalKeeper *wk = &walkeeper[i]; if (wk->state != SS_OFFLINE) { - ShutdownConnection(i); + ShutdownConnection(wk); } /* @@ -354,13 +360,11 @@ GetAcknowledgedByQuorumWALPosition(void) for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to epoch start lsn to the - * majority we don't really know which LSN is reliably committed as - * reported flush_lsn is physical end of wal, which can contain - * diverged history (compared to donor). 
+ * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = walkeeper[i].feedback.epoch == propTerm - ? walkeeper[i].feedback.flushLsn : 0; + responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? + walkeeper[i].feedback.flushLsn : 0; } qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); @@ -529,6 +533,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) * `ResetConnection` as needed */ walkeeper[n_walkeepers].conninfo[0] = '\0'; + initStringInfo(&walkeeper[n_walkeepers].outbuf); walkeeper[n_walkeepers].currMsg = NULL; walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; @@ -575,7 +580,7 @@ WalProposerStart(void) /* Initiate connections to all walkeeper nodes */ for (int i = 0; i < n_walkeepers; i++) { - ResetConnection(i); + ResetConnection(&walkeeper[i]); } WalProposerLoop(); @@ -831,17 +836,33 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) return msg; } +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(WalKeeper *wk) +{ + return GetHighestTerm(&wk->voteResponse.termHistory); +} /* * Called after majority of acceptors gave votes, it calculates the most * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since * which we'll write WAL in our term. - * Sets truncateLsn along the way (though it - * is not of much use at this point). + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). */ static void DetermineEpochStartLsn(void) { + TermHistory *dth; + propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; @@ -850,11 +871,11 @@ DetermineEpochStartLsn(void) { if (walkeeper[i].state == SS_IDLE) { - if (walkeeper[i].voteResponse.epoch > donorEpoch || - (walkeeper[i].voteResponse.epoch == donorEpoch && + if (GetEpoch(&walkeeper[i]) > donorEpoch || + (GetEpoch(&walkeeper[i]) == donorEpoch && walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { - donorEpoch = walkeeper[i].voteResponse.epoch; + donorEpoch = GetEpoch(&walkeeper[i]); propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; donor = i; } @@ -884,6 +905,16 @@ DetermineEpochStartLsn(void) Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * Proposer's term history is the donor's + its own entry. 
+ */ + dth = &walkeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", quorum, propTerm, @@ -926,7 +957,7 @@ ReconnectWalKeepers(void) for (int i = 0; i < n_walkeepers; i++) { if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(i); + ResetConnection(&walkeeper[i]); } } } @@ -1008,44 +1039,129 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec return false; } - /* - * Start sending entries to everyone from the beginning (truncateLsn), - * except for those who lives in donor's epoch and thus for sure has - * correct WAL. We could do here even slightly better, taking into account - * commitLsn of the rest to avoid sending them excessive data. + return true; +} + +/* + * Determine for wk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets wk->startStreamingAt. + */ +static void +SendProposerElected(WalKeeper *wk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. */ - for (int i = 0; i < n_walkeepers; i++) + th = &wk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. + */ + Assert((th->n_entries == 0) == + (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) { - if (walkeeper[i].state != SS_IDLE) - continue; + if (propTermHistory.entries[i].term != th->entries[i].term) + break; + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + wk->startStreamingAt = propTermHistory.entries[0].lsn; + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + wk->startStreamingAt = wk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : + wk->voteResponse.flushLsn); + wk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = wk->startStreamingAt; + msg.termHistory = &propTermHistory; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + + resetStringInfo(&wk->outbuf); + pq_sendint64_le(&wk->outbuf, msg.tag); + pq_sendint64_le(&wk->outbuf, msg.term); + pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); + } + + if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(wk); +} - if (walkeeper[i].voteResponse.epoch != donorEpoch) +/* + * Start streaming to safekeeper wk. + */ +static void +StartStreaming(WalKeeper *wk) +{ + int wki = wk - walkeeper; + + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= wk->startStreamingAt) { - SendMessageToNode(i, msgQueueHead); + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << wki; } else { - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) - { - /* message is already received by this walkeeper */ - msg->ackMask |= 1 << i; - } - else - { - /* - * By convention we always stream since the beginning of - * the record, and flushLsn points to it. - */ - walkeeper[i].startStreamingAt = walkeeper[i].voteResponse.flushLsn; - SendMessageToNode(i, msg); - break; - } - } + SendMessageToNode(wki, msg); + return; } } - return true; + wk->state = SS_IDLE; /* nothing to send yet, safekeeper is recovered */ } /* @@ -1196,7 +1312,7 @@ AdvancePollState(int i, uint32 events) * restart at a slower interval on calls to * ReconnectWalKeepers. */ - ShutdownConnection(i); + ShutdownConnection(wk); return; } @@ -1205,7 +1321,7 @@ AdvancePollState(int i, uint32 events) * un-register the old event and re-register an event on * the new socket. 
*/ - HackyRemoveWalProposerEvent(i); + HackyRemoveWalProposerEvent(wk); wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); break; } @@ -1219,7 +1335,7 @@ AdvancePollState(int i, uint32 events) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return; } @@ -1258,7 +1374,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return; /* @@ -1269,7 +1385,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); - ShutdownConnection(i); + ShutdownConnection(wk); return; } break; @@ -1301,7 +1417,7 @@ AdvancePollState(int i, uint32 events) * error handling or state setting is taken care of. We can * leave any other work until later. */ - if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) + if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) return; /* Protocol is all good, move to voting. */ @@ -1381,11 +1497,12 @@ AdvancePollState(int i, uint32 events) case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(i); + ResetConnection(wk); break; /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) return; @@ -1395,18 +1512,13 @@ AdvancePollState(int i, uint32 events) /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: - - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) return; elog(LOG, "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, wk->voteResponse.epoch, + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); @@ -1426,18 +1538,16 @@ AdvancePollState(int i, uint32 events) Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? 
*/ - - if (++n_votes != quorum) + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) { - /* Can't start streaming earlier than truncateLsn */ - wk->startStreamingAt = truncateLsn; - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); - /* - * We are already streaming WAL: send all pending messages - * to the attached walkeeper - */ - SendMessageToNode(i, msgQueueHead); + /* recovery already performed, just start streaming */ + SendProposerElected(wk); } else { @@ -1461,20 +1571,6 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); - - /* - * This message signifies epoch switch; it is needed - * to make the switch happen on donor, as he won't get - * any other messages until we start writing new WAL - * (and we e.g. don't in --sync mode at all) - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - - if (syncSafekeepers) - { - /* keep polling until all walkeepers are synced */ - return; - } } else if (syncSafekeepers) { @@ -1482,12 +1578,50 @@ AdvancePollState(int i, uint32 events) fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); exit(0); } + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } + + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + /* keep polling until all walkeepers are synced */ + return; + } + WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } break; + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: + + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. + */ + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) + return; + + StartStreaming(wk); + + break; + + /* * Idle state for sending WAL. Moved out only by calls to * SendMessageToNode @@ -1495,7 +1629,7 @@ AdvancePollState(int i, uint32 events) case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(i); + ResetConnection(wk); break; /* @@ -1543,15 +1677,16 @@ AdvancePollState(int i, uint32 events) * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - if (!AsyncWrite(i, req, + if (!AsyncWrite(wk, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + SS_SEND_WAL_FLUSH)) { if (req != &msg->req) free(req); return; } + wk->state = SS_RECV_FEEDBACK; if (req != &msg->req) free(req); @@ -1566,9 +1701,11 @@ AdvancePollState(int i, uint32 events) * the flush completes. If we still have more to do, we'll * wait until the next poll comes along. 
*/ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) return; + wk->state = SS_RECV_FEEDBACK; + break; /* @@ -1585,7 +1722,7 @@ AdvancePollState(int i, uint32 events) * necessary error handling or state setting is taken care * of. We can leave any other work until later. */ - if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) + if (!AsyncReadFixed(i, &wk->feedback, sizeof(wk->feedback))) return; next = wk->currMsg->next; @@ -1622,44 +1759,52 @@ AdvancePollState(int i, uint32 events) } } -/* - * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, - * returning whether the read was successful. - * - * If the read needs more polling, we return 'false' and keep the state - * unmodified, waiting until it becomes read-ready to try again. If it fully - * failed, a warning is emitted and the connection is reset. +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. */ static bool -AsyncRead(int i, void *value, size_t value_size) +AsyncRead(int i, char **buf, int *buf_size) { WalKeeper *wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - uint32 events; - switch (walprop_async_read(wk->conn, &buf, &buf_size)) + switch (walprop_async_read(wk->conn, buf, buf_size)) { - /* On success, there's just a couple more things we'll check below */ case PG_ASYNC_READ_SUCCESS: - break; + return true; - /* - * If we need more input, wait until the socket is read-ready and - * try again. - */ case PG_ASYNC_READ_TRY_AGAIN: - UpdateEventSet(wk, WL_SOCKET_READABLE); + /* WL_SOCKET_READABLE is always set during copyboth */ return false; case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", - wk->host, wk->port, - FormatWalKeeperState(wk->state), + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", wk->host, + wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return false; } + Assert(false); + return false; +} + +/* + * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, + * returning whether the read was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadFixed(int i, void *value, size_t value_size) +{ + WalKeeper *wk = &walkeeper[i]; + char *buf = NULL; + int buf_size = -1; + + if (!(AsyncRead(i, &buf, &buf_size))) + return false; /* * If we get here, the read was ok, but we still need to check it was the @@ -1677,14 +1822,68 @@ AsyncRead(int i, void *value, size_t value_size) /* Copy the resulting info into place */ memcpy(value, buf, buf_size); - /* Update the events for the WalKeeper, if it's going to wait */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - return true; } +/* + * Read next message with known type into provided struct. 
+ * TODO: migrate AsyncReadFixed here for all messages + */ +static bool +AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) +{ + WalKeeper *wk = &walkeeper[i]; + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(i, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return false; + } + + switch (tag) + { + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + /* * Blocking equivalent to AsyncWrite. * @@ -1702,7 +1901,7 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return false; } @@ -1721,23 +1920,18 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) /* * Starts a write into the 'i'th WAL keeper's postgres connection, moving to - * success_state only when the write succeeds. If the write needs flushing, - * moves to flush_state. + * flush_state (adjusting eventset) if write still needs flushing. * - * Returns false only if the write immediately fails. Upon failure, a warning is - * emitted and the connection is reset. + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. */ static bool -AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state) { - WalKeeper *wk = &walkeeper[i]; - uint32 events; - switch (walprop_async_write(wk->conn, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: - wk->state = success_state; - break; + return true; case PG_ASYNC_WRITE_TRY_FLUSH: /* @@ -1746,37 +1940,30 @@ AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKee * this function */ wk->state = flush_state; - break; + UpdateEventSet(wk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); + return false; + default: + Assert(false); return false; } - - /* If the new state will be waiting for something, update the event set */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - - return true; } /* * Flushes a previous call to AsyncWrite. 
This only needs to be called when the * socket becomes read or write ready *after* calling AsyncWrite. * - * If flushing completes, moves to 'success_state' and returns true. If more - * flushes are needed, does nothing and returns true. - * - * On failure, emits a warning, resets the connection, and returns false. + * If flushing successfully completes returns true, otherwise false. */ static bool -AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) +AsyncFlush(int i, bool socket_read_ready) { WalKeeper *wk = &walkeeper[i]; - uint32 events; /*--- * PQflush returns: @@ -1787,27 +1974,21 @@ AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) switch (walprop_flush(wk->conn, socket_read_ready)) { case 0: - /* On success, move to the next state - that logic is further down */ - break; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ + return true; case 1: /* Nothing to do; try again when the socket's ready */ - return true; + return false; case -1: elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ResetConnection(wk); + return false; + default: + Assert(false); return false; } - - wk->state = success_state; - - /* If the new state will be waiting for something, update the event set */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - - return true; } /* diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 16d84ac7f17..924b8fb1eb7 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -1,6 +1,7 @@ #include "postgres.h" #include "replication/walproposer.h" +#include "libpq/pqformat.h" #include "common/logging.h" #include "common/ip.h" #include "../interfaces/libpq/libpq-fe.h" @@ -68,6 +69,9 @@ FormatWalKeeperState(WalKeeperState state) case SS_WAIT_VERDICT: return_val = "wait-for-verdict"; break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; case SS_IDLE: return_val = "idle"; break; @@ -151,6 +155,7 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_NO_EVENTS; break; /* but flushing does require read- or write-ready */ + case SS_SEND_ELECTED_FLUSH: case SS_SEND_WAL_FLUSH: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; @@ -266,3 +271,49 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) 
+{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} \ No newline at end of file diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 2e32e0f0f7c..30d8d72256c 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -148,6 +148,9 @@ typedef enum */ SS_WAIT_VERDICT, + /* need to flush ProposerAnnouncement */ + SS_SEND_ELECTED_FLUSH, + /* * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. @@ -180,7 +183,7 @@ typedef enum typedef uint64 term_t; /* - * Proposer -> Acceptor messaging. + * Proposer <-> Acceptor messaging. */ /* Initial Proposer -> Acceptor message */ @@ -197,6 +200,11 @@ typedef struct ProposerGreeting uint32 walSegSize; } ProposerGreeting; +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. */ @@ -216,17 +224,47 @@ typedef struct VoteRequest pg_uuid_t proposerId; /* for monitoring/debugging */ } VoteRequest; +/* Element of term switching chain. */ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + /* Vote itself, sent from safekeeper to proposer */ typedef struct VoteResponse { - uint64 tag; - term_t term; /* not really needed, just adds observability */ + AcceptorProposerMessage apm; + term_t term; uint64 voteGiven; - /// Safekeeper's log position, to let proposer choose the most advanced one - term_t epoch; + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ XLogRecPtr flushLsn; XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + TermHistory termHistory; } VoteResponse; +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. + */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; +} ProposerElected; + /* * Header of request with WAL message sent from proposer to walkeeper. */ @@ -289,7 +327,6 @@ typedef struct AppendResponse */ uint64 tag; term_t term; - term_t epoch; XLogRecPtr flushLsn; // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit @@ -316,6 +353,7 @@ typedef struct WalKeeper * reach SS_SEND_WAL; not before. */ WalProposerConn* conn; + StringInfoData outbuf; WalMessage* currMsg; /* message been send to the receiver */ @@ -325,8 +363,7 @@ typedef struct WalKeeper VoteResponse voteResponse; /* the vote */ AppendResponse feedback; /* feedback to master */ /* - * streaming must be started at the record boundary which is saved here, if - * it differs from the chunk start + * Streaming will start here; must be record boundary. 
*/ XLogRecPtr startStreamingAt; } WalKeeper; @@ -341,6 +378,10 @@ char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); void WalProposerPoll(void); void WalProposerRegister(void); void ProcessStandbyReply(XLogRecPtr writePtr, From bd291b070b7eb39e98bd6fc80398cd4950fc7d03 Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 10 Nov 2021 00:09:13 +0300 Subject: [PATCH 077/214] Clarify the meaning of StandbyReply LSNs used for backpressure --- src/backend/access/transam/xloginsert.c | 8 +++++--- src/backend/replication/walproposer.c | 10 ++++++++-- src/backend/replication/walsender.c | 6 +++++- src/include/replication/walproposer.h | 3 ++- src/include/replication/walsender.h | 2 +- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5f3e1ff9c9e..cc747c7df13 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -152,12 +152,14 @@ XLogBeginInsert(void) { XLogRecPtr replicaWriteLsn; XLogRecPtr replicaFlushLsn; + XLogRecPtr replicaApplyLsn; XLogRecPtr myFlushLsn = GetFlushRecPtr(); - GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn); + GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn, &replicaApplyLsn); - if ((replicaWriteLsn != UnknownXLogRecPtr - && myFlushLsn > replicaWriteLsn + max_replication_write_lag*MB) || + //TODO: rename max_replication_write_lag to max_replication_apply_lag ? + if ((replicaApplyLsn != UnknownXLogRecPtr + && myFlushLsn > replicaApplyLsn + max_replication_write_lag*MB) || (replicaFlushLsn != UnknownXLogRecPtr && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) { diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 3defed7f9ab..720c5138f8d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -395,9 +395,15 @@ HandleWalKeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) - ProcessStandbyReply(lastFeedback.diskConsistentLsn, + ProcessStandbyReply( + // write_lsn + // Not used, because we use SYNCHRONOUS_COMMIT_REMOTE_FLUSH. lastFeedback.flushLsn, - InvalidXLogRecPtr, GetCurrentTimestamp(), false); + //flush_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. 
+ lastFeedback.diskConsistentLsn, + GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index ecda7c45791..f0969699c4e 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3783,10 +3783,11 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) * Get minimal write and flush LSN among all live replicas */ void -GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) +GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply_lsn) { XLogRecPtr min_write_lsn = UnknownXLogRecPtr; XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; + XLogRecPtr min_apply_lsn = UnknownXLogRecPtr; for (int i = 0; i < max_wal_senders; i++) { WalSnd *walsnd = &WalSndCtl->walsnds[i]; @@ -3799,11 +3800,14 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) */ XLogRecPtr written = walsnd->write; XLogRecPtr flushed = walsnd->flush; + XLogRecPtr applied = walsnd->apply; min_write_lsn = Min(written, min_write_lsn); min_flush_lsn = Min(flushed, min_flush_lsn); + min_apply_lsn = Min(applied, min_apply_lsn); } } *write_lsn = min_write_lsn; *flush_lsn = min_flush_lsn; + *apply_lsn = min_apply_lsn; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 30d8d72256c..59f4d73ed6d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -327,11 +327,12 @@ typedef struct AppendResponse */ uint64 tag; term_t term; + // TODO: add comment XLogRecPtr flushLsn; // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; - // Part of WALL applied and written to the disk by all pageservers + // Part of WAL applied and written to the disk by all pageservers XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; } AppendResponse; diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index bd2f9ad6d28..2ea0cbd69bc 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -48,7 +48,7 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); -extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush); +extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); /* * Remember that we want to wakeup walsenders later * From e23594ac667c6ab15ca6a4afb7bc5e6a99f501b0 Mon Sep 17 00:00:00 2001 From: anastasia Date: Thu, 11 Nov 2021 16:20:07 +0300 Subject: [PATCH 078/214] Use max_replication_apply_lag instead of max_replication_write_lag. Move backpressure throttling from XlogInsert, to ProcessInterrupts(), to restrict writing operations outside of critical section. 
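
The mechanics, roughly: compute how far the local flush LSN has run ahead of the slowest replica's flush/apply position and, outside any critical section, sleep in short slices until the lag drops back under the configured limit. A minimal standalone sketch of that check (the stand-in types, sample LSNs and simulated replica progress are assumptions made for illustration; the lag formula and the 10 ms delay follow the diff below, and the UnknownXLogRecPtr "no live replica" case is omitted):

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    typedef uint64_t XLogRecPtr;
    #define MB ((XLogRecPtr) 1024 * 1024)

    static int max_replication_apply_lag = 100;  /* lag limit in MB (illustrative) */
    static int max_replication_flush_lag = 100;  /* lag limit in MB (illustrative) */

    /* How many bytes the local flush LSN is ahead of the allowed window. */
    static uint64_t
    backpressure_lag(XLogRecPtr my_flush, XLogRecPtr replica_flush,
                     XLogRecPtr replica_apply)
    {
        if (max_replication_flush_lag > 0 &&
            my_flush > replica_flush + max_replication_flush_lag * MB)
            return my_flush - replica_flush - max_replication_flush_lag * MB;

        if (max_replication_apply_lag > 0 &&
            my_flush > replica_apply + max_replication_apply_lag * MB)
            return my_flush - replica_apply - max_replication_apply_lag * MB;

        return 0;
    }

    int
    main(void)
    {
        XLogRecPtr my_flush = 301 * MB;
        XLogRecPtr replica_flush = 250 * MB;
        XLogRecPtr replica_apply = 200 * MB;

        /* Throttle loop: sleep in 10 ms slices until replicas catch up. */
        while (backpressure_lag(my_flush, replica_flush, replica_apply) > 0)
        {
            usleep(10000);        /* 0.01 s, as in the patch */
            replica_apply += MB;  /* pretend the replica advances */
        }
        printf("writer unblocked\n");
        return 0;
    }

With the sample values above the writer is 1 MB over the apply-lag limit, sleeps once, and resumes as soon as the replica's apply position moves forward.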
--- src/backend/access/transam/xloginsert.c | 39 +------------------ src/backend/storage/buffer/bufmgr.c | 51 +++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 8 ++-- src/include/access/xloginsert.h | 2 +- 4 files changed, 57 insertions(+), 43 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index cc747c7df13..4779a81c4d1 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -64,7 +64,7 @@ typedef struct } registered_buffer; /* GUCs */ -int max_replication_write_lag; +int max_replication_apply_lag; int max_replication_flush_lag; static registered_buffer *registered_buffers; @@ -122,7 +122,6 @@ static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); /* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ -#define BACK_PRESSURE_TIMEOUT 100 #define MB ((XLogRecPtr)1024*1024) /* @@ -143,42 +142,6 @@ XLogBeginInsert(void) if (begininsert_called) elog(ERROR, "XLogBeginInsert was already called"); - if (max_replication_write_lag != 0 || max_replication_flush_lag != 0) - { - uint64 slept = 0; - - /* Suspend writes until replicas catch up */ - while (true) - { - XLogRecPtr replicaWriteLsn; - XLogRecPtr replicaFlushLsn; - XLogRecPtr replicaApplyLsn; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn, &replicaApplyLsn); - - //TODO: rename max_replication_write_lag to max_replication_apply_lag ? - if ((replicaApplyLsn != UnknownXLogRecPtr - && myFlushLsn > replicaApplyLsn + max_replication_write_lag*MB) || - (replicaFlushLsn != UnknownXLogRecPtr - && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) - { - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - BACK_PRESSURE_TIMEOUT, - WAIT_EVENT_BACK_PRESSURE); - ResetLatch(MyLatch); - slept += BACK_PRESSURE_TIMEOUT; - } - else - break; - } - - // XXX: INFO will cause a lot of regression tests to fail. - if (slept > 0) - elog(DEBUG1, "slept for " UINT64_FORMAT " ms while waiting for all replicas to catch up", slept); - } - begininsert_called = true; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index fadcc1b6255..226162be660 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -54,6 +54,7 @@ #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" +#include "replication/walsender.h" /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -4056,6 +4057,41 @@ UnlockBuffers(void) } } +// Check if we need to suspend inserts because of lagging replication. 
+static uint64 +backpressureThrottle() +{ + if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); + #define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((flushPtr != UnknownXLogRecPtr + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != UnknownXLogRecPtr + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} + /* * Acquire or release the content_lock for the buffer. */ @@ -4075,7 +4111,22 @@ LockBuffer(Buffer buffer, int mode) else if (mode == BUFFER_LOCK_SHARE) LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); else if (mode == BUFFER_LOCK_EXCLUSIVE) + { + // Suspend writes until replicas catch up + uint64 lag = backpressureThrottle(); + while (lag > 0) + { + elog(DEBUG2, "BackpressureThrottle LockBuffer(LW_EXCLUSIVE): lag %lu", lag); + #define BACK_PRESSURE_TIMEOUT 10000L // 0.01 sec + pg_usleep(BACK_PRESSURE_TIMEOUT); + lag = backpressureThrottle(); + + // We can hang here for a while. Don't block cancel requests. + CHECK_FOR_INTERRUPTS(); + } + LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + } else elog(ERROR, "unrecognized buffer lock mode: %d", mode); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9cb96b4a950..9158f475dbc 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2913,13 +2913,13 @@ static struct config_int ConfigureNamesInt[] = }, { - {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal write lag between master and replicas."), - gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," - "backends are blocked"), + gettext_noop("When lag between minimal apply position of replica and current LSN exceeds this value," + "backends are blocked."), GUC_UNIT_MB, }, - &max_replication_write_lag, + &max_replication_apply_lag, 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 699ca56ed25..45dcaf99d9e 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,7 +38,7 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ -extern int max_replication_write_lag; +extern int max_replication_apply_lag; extern int max_replication_flush_lag; /* prototypes for public functions in xloginsert.c: */ From e5ce1303d50ff4faf0fc3ea0becf8f0c4114f6a4 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 25 Nov 2021 18:58:39 +0300 Subject: [PATCH 079/214] Forward pageserver connection string to safekeeper This is needed for implementation of tenant rebalancing. With this change safekeeper becomes aware of which pageserver is supposed to be used for replication from this compute. This also changes logic of substitution of auth token inside the connection string. 
So it is substituted during config variable parsing and available for both, smgr pageserver connection and walproposer safekeeper connection. --- contrib/zenith/libpagestore.c | 187 ++++++++++++++------------ contrib/zenith/pagestore_smgr.c | 2 +- src/backend/replication/walproposer.c | 36 +++-- src/include/replication/walproposer.h | 1 + 4 files changed, 128 insertions(+), 98 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 1dc708f0ad7..2caf5d74b6e 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,6 +42,8 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; +char *page_server_connstring_raw; + static ZenithResponse *zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call @@ -52,93 +54,8 @@ zenith_connect() { char *query; int ret; - char *auth_token; - char *err = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - int noptions = 0; - - /* this is heavily inspired by psql/command.c::do_connect */ - conn_options = PQconninfoParse(page_server_connstring, &err); - - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? pstrdup(err) : "out of memory"; - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. - */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - noptions++; - if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - zenith_log(LOG, "using auth token from environment passed via env"); - - /* - * inspired by PQconninfoFree and conninfo_storeval so - * just free the old one and replace with freshly - * malloc'ed one - */ - free(conn_option->val); - conn_option->val = strdup(auth_token); - } - } - } - } - - /* - * copy values from PQconninfoOption to key/value arrays because - * PQconnectdbParams accepts options this way - */ - { - const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); - const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - - for (i = 0; i < noptions; i++) - { - keywords[i] = conn_options[i].keyword; - values[i] = conn_options[i].val; - } - /* add array terminator */ - keywords[i] = NULL; - values[i] = NULL; - - pageserver_conn = PQconnectdbParams(keywords, values, false); - free(keywords); - free(values); - } - - PQconninfoFree(conn_options); + pageserver_conn = PQconnectdb(page_server_connstring); if (PQstatus(pageserver_conn) == 
CONNECTION_BAD) { @@ -197,6 +114,7 @@ zenith_connect() } } + // FIXME: when auth is enabled this ptints JWT to logs zenith_log(LOG, "libpqpagestore: connected to '%s'", page_server_connstring); connected = true; @@ -276,6 +194,96 @@ check_zenith_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(zid, *newval, 16); } +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + /* + * Here we substitute password in connection string with an environment variable. + * To simplify things we construct a connection string back with only known options. + * In particular: host port user and password. We do not currently use other options and + * constructing full connstring in an URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. + */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + zenith_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + // allocate connection string in a TopMemoryContext to make sure it is not freed + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + /* * Module initialization function */ @@ -285,7 +293,7 @@ _PG_init(void) DefineCustomStringVariable("zenith.page_server_connstring", "connection string to the page server", NULL, - &page_server_connstring, + &page_server_connstring_raw, "", PGC_POSTMASTER, 0, /* no flags required */ @@ -335,9 +343,14 @@ _PG_init(void) zenith_log(PqPageStoreTrace, "libpqpagestore already loaded"); page_server = &api; + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + /* Is there more correct way to pass CustomGUC to postgres code? */ zenith_timeline_walproposer = zenith_timeline; zenith_tenant_walproposer = zenith_tenant; + /* Walproposer instructcs safekeeper which pageserver to use for replication */ + zenith_pageserver_connstring_walproposer = page_server_connstring; if (wal_redo) { diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 25ad896491b..81aa2339779 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -86,7 +86,7 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; +char *page_server_connstring; // with substituted password char *callmemaybe_connstring; char *zenith_timeline; char *zenith_tenant; diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 720c5138f8d..e0fbd653f92 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -283,9 +283,14 @@ ResetConnection(WalKeeper *wk) */ if (wk->conninfo[0] == '\0') { - sprintf((char *) &wk->conninfo, + int written = 0; + written = snprintf((char *) &wk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for walkeeper %s:%s", wk->host, wk->port); } wk->conn = walprop_connect_start((char *) &wk->conninfo); @@ -495,6 +500,7 @@ HandleWalKeeperResponse(void) char *zenith_timeline_walproposer = NULL; char *zenith_tenant_walproposer = NULL; +char *zenith_pageserver_connstring_walproposer = NULL; static void @@ -566,6 +572,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) if (*zenith_tenant_walproposer != '\0' && !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; @@ -1337,18 +1344,27 @@ AdvancePollState(int i, uint32 events) * sending, wait for response with SS_WAIT_EXEC_RESULT */ case SS_EXEC_STARTWALPUSH: - if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = 
psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); + break; } - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); - break; - case SS_WAIT_EXEC_RESULT: switch (walprop_get_query_result(wk->conn)) { diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 59f4d73ed6d..2b6d281ec2a 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -36,6 +36,7 @@ typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; extern char *zenith_tenant_walproposer; +extern char *zenith_pageserver_connstring_walproposer; /* Possible return values from ReadPGAsync */ typedef enum From a0c96c876de7342a8389ebba64430d58e01e313b Mon Sep 17 00:00:00 2001 From: anastasia Date: Fri, 26 Nov 2021 12:10:10 +0300 Subject: [PATCH 080/214] Move backpressure throttling to ProcessInterrupts() --- src/backend/access/transam/xloginsert.c | 5 +++ src/backend/replication/walsender.c | 34 +++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 50 ------------------------- src/backend/tcop/postgres.c | 34 ++++++++++++++++- src/include/replication/walsender.h | 1 + 5 files changed, 72 insertions(+), 52 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 4779a81c4d1..306841fb8d2 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -460,6 +460,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (backpressure_lag() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f0969699c4e..c3b765e120e 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3811,3 +3811,37 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply *apply_lsn = min_apply_lsn; } +// Check if we need to suspend inserts because of lagging replication. 
+uint64 +backpressure_lag(void) +{ + if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); + #define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((flushPtr != UnknownXLogRecPtr + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != UnknownXLogRecPtr + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 226162be660..b96e033e53c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4057,41 +4057,6 @@ UnlockBuffers(void) } } -// Check if we need to suspend inserts because of lagging replication. -static uint64 -backpressureThrottle() -{ - if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); - #define MB ((XLogRecPtr)1024*1024) - - elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((flushPtr != UnknownXLogRecPtr - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); - } - - if ((applyPtr != UnknownXLogRecPtr - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); - } - } - return 0; -} - /* * Acquire or release the content_lock for the buffer. */ @@ -4111,22 +4076,7 @@ LockBuffer(Buffer buffer, int mode) else if (mode == BUFFER_LOCK_SHARE) LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); else if (mode == BUFFER_LOCK_EXCLUSIVE) - { - // Suspend writes until replicas catch up - uint64 lag = backpressureThrottle(); - while (lag > 0) - { - elog(DEBUG2, "BackpressureThrottle LockBuffer(LW_EXCLUSIVE): lag %lu", lag); - #define BACK_PRESSURE_TIMEOUT 10000L // 0.01 sec - pg_usleep(BACK_PRESSURE_TIMEOUT); - lag = backpressureThrottle(); - - // We can hang here for a while. Don't block cancel requests. - CHECK_FOR_INTERRUPTS(); - } - LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); - } else elog(ERROR, "unrecognized buffer lock mode: %d", mode); } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c613495ee2e..1c3974aedd1 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3138,8 +3138,8 @@ RecoveryConflictInterrupt(ProcSignalReason reason) * return; another interrupt could have arrived. But we promise that * any pre-existing one will have been serviced.) */ -void -ProcessInterrupts(void) +static void +ProcessInterrupts_pg(void) { /* OK to accept any interrupts now? 
*/ if (InterruptHoldoffCount != 0 || CritSectionCount != 0) @@ -3381,6 +3381,36 @@ ProcessInterrupts(void) ProcessLogMemoryContextInterrupt(); } +void +ProcessInterrupts(void) +{ + uint64 lag; + + if (InterruptHoldoffCount != 0 || CritSectionCount != 0) + return; + + // Don't throttle read only transactions + if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + { + ProcessInterrupts_pg(); + return; + } + + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec + while(true) + { + ProcessInterrupts_pg(); + + // Suspend writers until replicas catch up + lag = backpressure_lag(); + if (lag <= 0) + break; + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + pg_usleep(BACK_PRESSURE_DELAY); + } +} + /* * IA64-specific code to fetch the AR.BSP register for stack depth checks. diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 2ea0cbd69bc..fe21617994a 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -49,6 +49,7 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); +extern uint64 backpressure_lag(void); /* * Remember that we want to wakeup walsenders later * From 873574ff8a6a5ae4a2b9772e41c02be2a8aec1d5 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 15 Dec 2021 16:10:03 +0300 Subject: [PATCH 081/214] Stop building docker images in this repo. Now docker images are being built in zenith repo as that way we have sequential version number that allows us to compare compute/storage versions. --- .circleci/config.yml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 16a271b0386..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,30 +0,0 @@ -version: 2.1 - -jobs: - - # Build zenithdb/compute-node:latest image and push it to Docker hub - docker_image: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push Docker image - command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - docker build -t zenithdb/compute-node:latest . && docker push zenithdb/compute-node:latest - -workflows: - version: 2 - compute_node: - jobs: - # Build and push image only for commits to `main`. - - docker_image: - # Context gives an ability to login - context: Docker Hub - filters: - branches: - only: - - main From 7c5e02856922b74f566ab959f53035bc14171d30 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 21 Dec 2021 16:51:22 +0300 Subject: [PATCH 082/214] [walproposer] Async WAL append (#105) Implement async wp <-> sk protocol, send WAL messages ahead of feedback replies. New SS_ACTIVE state is introduced instead of former SS_SEND_WAL / SS_SEND_WAL_FLUSH / SS_RECV_FEEDBACK. 
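
In the new scheme a single SS_ACTIVE branch serves both directions of the socket: queued AppendRequests are written while the socket is writable, and AppendResponses are drained while it is readable, with write interest kept only while something is still queued. A simplified, self-contained model of that control flow (the WalKeeper fields, the counters and the main() driver are illustrative stand-ins rather than the real structures; the branch order mirrors the SS_ACTIVE case added to AdvancePollState below):

    #include <stdbool.h>
    #include <stdio.h>

    #define WL_SOCKET_READABLE  (1 << 0)
    #define WL_SOCKET_WRITEABLE (1 << 1)

    typedef struct WalKeeper
    {
        int pending;     /* queued but not yet sent AppendRequests */
        int unacked;     /* sent but not yet acknowledged requests */
        int wait_events; /* what we ask the event loop to watch */
    } WalKeeper;

    static bool
    SendAppendRequests(WalKeeper *wk)
    {
        /* send until the queue is empty (a real socket could block midway) */
        while (wk->pending > 0)
        {
            wk->pending--;
            wk->unacked++;
        }
        return true;              /* false would mean the connection was reset */
    }

    static bool
    RecvAppendResponses(WalKeeper *wk)
    {
        while (wk->unacked > 0)
            wk->unacked--;        /* one ack per response */
        return true;
    }

    static void
    HandleActiveState(WalKeeper *wk, int events)
    {
        if ((events & WL_SOCKET_WRITEABLE) && !SendAppendRequests(wk))
            return;
        if ((events & WL_SOCKET_READABLE) && !RecvAppendResponses(wk))
            return;

        /* always keep read interest; write interest only while data is queued */
        wk->wait_events = WL_SOCKET_READABLE |
            (wk->pending > 0 ? WL_SOCKET_WRITEABLE : 0);
    }

    int
    main(void)
    {
        WalKeeper wk = {.pending = 3};

        HandleActiveState(&wk, WL_SOCKET_WRITEABLE);
        HandleActiveState(&wk, WL_SOCKET_READABLE);
        printf("pending=%d unacked=%d\n", wk.pending, wk.unacked);
        return 0;
    }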
--- .../libpqwalproposer/libpqwalproposer.c | 9 +- src/backend/replication/walproposer.c | 357 +++++++++++------- src/backend/replication/walproposer_utils.c | 15 +- src/include/replication/walproposer.h | 38 +- 4 files changed, 230 insertions(+), 189 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index f538ed9133f..177c93eb85d 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -22,7 +22,7 @@ static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* static bool libpqprop_send_query(WalProposerConn* conn, char* query); static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn, bool socket_read_ready); +static int libpqprop_flush(WalProposerConn* conn); static void libpqprop_finish(WalProposerConn* conn); static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); @@ -239,13 +239,8 @@ libpqprop_socket(WalProposerConn* conn) } static int -libpqprop_flush(WalProposerConn* conn, bool socket_read_ready) +libpqprop_flush(WalProposerConn* conn) { - /* If the socket is read-ready, we have to call PQconsumeInput before - * calling PQflush (according to libpq docs) */ - if (socket_read_ready && !PQconsumeInput(conn->pg_conn)) - return -1; /* return failure if PQconsumeInput fails */ - return (PQflush(conn->pg_conn)); } diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index e0fbd653f92..99a77aba280 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -102,7 +102,7 @@ static bool AsyncReadFixed(int i, void *value, size_t value_size); static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); -static bool AsyncFlush(int i, bool socket_read_ready); +static bool AsyncFlush(WalKeeper *wk); static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); static void BroadcastMessage(WalMessage *msg); static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); @@ -110,6 +110,7 @@ static term_t GetHighestTerm(TermHistory *th); static term_t GetEpoch(WalKeeper *wk); static void SendProposerElected(WalKeeper *wk); static void StartStreaming(WalKeeper *wk); +static bool SendAppendRequests(WalKeeper *wk); /* @@ -236,8 +237,9 @@ HackyRemoveWalProposerEvent(WalKeeper *to_remove) continue; /* If this WAL keeper isn't offline, add an event for it! 
*/ - if ((desired_events = WalKeeperStateDesiredEvents(wk->state))) + if (wk->conn != NULL) { + desired_events = WalKeeperStateDesiredEvents(wk->state); wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); } } @@ -251,7 +253,10 @@ ShutdownConnection(WalKeeper *wk) walprop_finish(wk->conn); wk->conn = NULL; wk->state = SS_OFFLINE; + wk->flushWrite = false; wk->currMsg = NULL; + wk->ackMsg = NULL; + if (wk->voteResponse.termHistory.entries) pfree(wk->voteResponse.termHistory.entries); wk->voteResponse.termHistory.entries = NULL; @@ -546,7 +551,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ walkeeper[n_walkeepers].conninfo[0] = '\0'; initStringInfo(&walkeeper[n_walkeepers].outbuf); + walkeeper[n_walkeepers].flushWrite = false; walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].ackMsg = NULL; walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; } @@ -699,7 +706,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) } /* - * Send message to the particular node + * Start sending message to the particular node. * * Always updates the state and event set for the WAL keeper; setting either of * these before calling would be redundant work. @@ -720,27 +727,11 @@ SendMessageToNode(int i, WalMessage *msg) msg = msg->next; wk->currMsg = msg; + wk->flushWrite = false; - /* Only try to send the message if it's non-null */ - if (wk->currMsg) - { - wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - wk->currMsg->req.truncateLsn = truncateLsn; - - /* - * Once we've selected and set up our message, actually start sending - * it. - */ - wk->state = SS_SEND_WAL; - /* Don't ned to update the event set; that's done by AdvancePollState */ - - AdvancePollState(i, WL_NO_EVENTS); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); - } + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + if (!SendAppendRequests(wk)) + return; } /* @@ -751,7 +742,7 @@ BroadcastMessage(WalMessage *msg) { for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE && walkeeper[i].currMsg == NULL) + if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) { SendMessageToNode(i, msg); } @@ -1154,13 +1145,20 @@ SendProposerElected(WalKeeper *wk) } /* - * Start streaming to safekeeper wk. + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. */ static void StartStreaming(WalKeeper *wk) { int wki = wk - walkeeper; + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. + */ + wk->state = SS_ACTIVE; + UpdateEventSet(wk, WL_SOCKET_READABLE); + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { if (msg->req.endLsn <= wk->startStreamingAt) @@ -1174,7 +1172,6 @@ StartStreaming(WalKeeper *wk) return; } } - wk->state = SS_IDLE; /* nothing to send yet, safekeeper is recovered */ } /* @@ -1233,13 +1230,184 @@ WalProposerPoll(void) } } +/* + * Send queue messages starting from wk->currMsg until the end or non-writable + * socket, whichever comes first. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. 
+ */ +static bool +SendAppendRequests(WalKeeper *wk) +{ + int wki = wk - walkeeper; + WalMessage *msg; + AppendRequestHeader *req; + + if (wk->flushWrite) + { + if (!AsyncFlush(wk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. + */ + return wk->state == SS_ACTIVE; + + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } + + while (wk->currMsg) + { + msg = wk->currMsg; + req = &msg->req; + + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + + Assert((msg->ackMask & (1 << wki)) == 0); + + /* + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. + */ + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; + + Assert(wk->startStreamingAt < req->endLsn); + + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } + + elog(LOG, + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + + /* if this is the first sent message, we should start processing feedback */ + if (wk->ackMsg == NULL) + wk->ackMsg = wk->currMsg; + + /* + * We write with msg->size here because the body of the + * message is stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(wk, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, + SS_ACTIVE)) + { + if (req != &msg->req) + free(req); + if (wk->state == SS_ACTIVE) + { + wk->flushWrite = true; + return true; + } + return false; + } + if (req != &msg->req) + free(req); + + /* continue writing the next message */ + wk->currMsg = wk->currMsg->next; + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(WalKeeper *wk) +{ + XLogRecPtr minQuorumLsn; + int wki = wk - walkeeper; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + break; + + Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); + + /* + * We shouldn't read responses ahead of wk->currMsg, because that will + * look like we are receiving responses for messages that haven't been + * sent yet. This can happen when message was placed in a buffer in + * SendAppendRequests, but sent through a wire only with a flush inside + * AsyncReadFixed. In this case, we should move wk->currMsg. 
+ */ + if (wk->ackMsg == wk->currMsg) + { + /* Couldn't happen without flush flag */ + Assert(wk->flushWrite); + + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } + + wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + * receiving of this + * message */ + + wk->ackMsg = wk->ackMsg->next; + readAnything = true; + } + + if (!readAnything) + return wk->state == SS_ACTIVE; + + HandleWalKeeperResponse(); + + /* + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + lastSentCommitLsn = minQuorumLsn; + } + + return wk->state == SS_ACTIVE; +} + /* Performs the logic for advancing the state machine of the 'i'th walkeeper, * given that a certain set of events has occured. */ static void AdvancePollState(int i, uint32 events) { WalKeeper *wk = &walkeeper[i]; - /* * Keep advancing the state while either: (a) the event is still * unprocessed (usually because it's the first iteration of the loop), or @@ -1405,7 +1573,7 @@ AdvancePollState(int i, uint32 events) * generic "something went wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", + elog(WARNING, "Received bad response from walkeeper %s:%s query execution", wk->host, wk->port); ShutdownConnection(wk); return; @@ -1607,6 +1775,12 @@ AdvancePollState(int i, uint32 events) SendProposerElected(&walkeeper[i]); } + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ + if (syncSafekeepers) { /* @@ -1636,7 +1810,7 @@ AdvancePollState(int i, uint32 events) * the flush completes. If we still have more to do, we'll * wait until the next poll comes along. */ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) + if (!AsyncFlush(wk)) return; StartStreaming(wk); @@ -1654,123 +1828,18 @@ AdvancePollState(int i, uint32 events) ResetConnection(wk); break; - /* - * Start to send the message at wk->currMsg. Triggered only by - * calls to SendMessageToNode - */ - case SS_SEND_WAL: - { - WalMessage *msg = wk->currMsg; - AppendRequestHeader *req = &msg->req; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. 
- */ - if (wk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; - - Assert(wk->startStreamingAt < req->endLsn); - - len = msg->req.endLsn - wk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = wk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - - msg->req.beginLsn, - len); - } - - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - - req->beginLsn, - SS_SEND_WAL_FLUSH)) - { - if (req != &msg->req) - free(req); + case SS_ACTIVE: + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) return; - } - wk->state = SS_RECV_FEEDBACK; - if (req != &msg->req) - free(req); - - break; - } - - /* Flush the WAL message we're sending from SS_SEND_WAL */ - case SS_SEND_WAL_FLUSH: - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) - return; - - wk->state = SS_RECV_FEEDBACK; - - break; - - /* - * Start to receive the feedback from a message sent via - * SS_SEND_WAL - */ - case SS_RECV_FEEDBACK: - { - WalMessage *next; - XLogRecPtr minQuorumLsn; - - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. - */ - if (!AsyncReadFixed(i, &wk->feedback, sizeof(wk->feedback))) + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) return; - next = wk->currMsg->next; - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms - * receiving of this - * message */ - - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); /* Updates state & event set */ - - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other - * outbound messages pending. - */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - lastSentCommitLsn = minQuorumLsn; - } - break; - } + UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); + break; } /* @@ -1983,17 +2052,15 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * If flushing successfully completes returns true, otherwise false. 
*/ static bool -AsyncFlush(int i, bool socket_read_ready) +AsyncFlush(WalKeeper *wk) { - WalKeeper *wk = &walkeeper[i]; - /*--- * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ - switch (walprop_flush(wk->conn, socket_read_ready)) + switch (walprop_flush(wk->conn)) { case 0: UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 924b8fb1eb7..c61ab87db45 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -75,14 +75,8 @@ FormatWalKeeperState(WalKeeperState state) case SS_IDLE: return_val = "idle"; break; - case SS_SEND_WAL: - return_val = "WAL-sending"; - break; - case SS_SEND_WAL_FLUSH: - return_val = "WAL-sending (flushing)"; - break; - case SS_RECV_FEEDBACK: - return_val = "WAL-feedback-receiving"; + case SS_ACTIVE: + return_val = "active"; break; } @@ -143,7 +137,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - case SS_RECV_FEEDBACK: result = WL_SOCKET_READABLE; break; @@ -151,12 +144,12 @@ WalKeeperStateDesiredEvents(WalKeeperState state) case SS_EXEC_STARTWALPUSH: case SS_HANDSHAKE_SEND: case SS_SEND_VOTE: - case SS_SEND_WAL: result = WL_NO_EVENTS; break; /* but flushing does require read- or write-ready */ case SS_SEND_ELECTED_FLUSH: - case SS_SEND_WAL_FLUSH: + /* Active state does both reading and writing to the socket */ + case SS_ACTIVE: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 2b6d281ec2a..ca27df2d19b 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -70,10 +70,7 @@ typedef enum /* * WAL safekeeper state * - * States are listed here in the order that they're executed - with the only - * exception occuring from the "send WAL" cycle, which loops as: - * - * SS_IDLE -> SS_SEND_WAL (+ flush) -> SS_RECV_FEEDBACK -> SS_IDLE/SS_SEND_WAL + * States are listed here in the order that they're executed. * * Most states, upon failure, will move back to SS_OFFLINE by calls to * ResetConnection or ShutdownConnection. @@ -156,28 +153,15 @@ typedef enum * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. * - * Moves to SS_SEND_WAL only by calls to SendMessageToNode. + * Moves to SS_ACTIVE only by calls to SendMessageToNode. */ SS_IDLE, + /* - * Start sending the message at currMsg. This state is only ever reached - * through calls to SendMessageToNode. - * - * Sending needs to flush; immediately moves to SS_SEND_WAL_FLUSH. - */ - SS_SEND_WAL, - /* - * Flush the WAL message, repeated until successful. On success, moves to - * SS_RECV_FEEDBACK. - */ - SS_SEND_WAL_FLUSH, - /* - * Currently reading feedback from sending the WAL. - * - * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to - * SendMessageToNode. + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. */ - SS_RECV_FEEDBACK, + SS_ACTIVE, } WalKeeperState; /* Consensus logical timestamp. */ @@ -352,12 +336,14 @@ typedef struct WalKeeper * postgres protocol connection to the WAL acceptor * * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we - * reach SS_SEND_WAL; not before. 
+ * reach SS_ACTIVE; not before. */ WalProposerConn* conn; StringInfoData outbuf; + bool flushWrite; /* set to true if we wrote currMsg, but still need to call AsyncFlush */ WalMessage* currMsg; /* message been send to the receiver */ + WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ WalKeeperState state; /* walkeeper state machine state */ @@ -470,7 +456,7 @@ typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerCon typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn, bool socket_read_ready); +typedef int (*walprop_flush_fn) (WalProposerConn* conn); /* Re-exported PQfinish */ typedef void (*walprop_finish_fn) (WalProposerConn* conn); @@ -545,8 +531,8 @@ typedef struct WalProposerFunctionsType WalProposerFunctions->walprop_set_nonblocking(conn, arg) #define walprop_socket(conn) \ WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn, consume_input) \ - WalProposerFunctions->walprop_flush(conn, consume_input) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) #define walprop_finish(conn) \ WalProposerFunctions->walprop_finish(conn) #define walprop_async_read(conn, buf, amount) \ From b29a4a8d71654d556c7969cb83c4e309f00e12c5 Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 22 Dec 2021 18:47:38 +0300 Subject: [PATCH 083/214] Fix walsender to work with zenith style standbyReply that sends non-zero flushLsn. Clean up backpressure defaults. --- src/backend/replication/walproposer.c | 3 +-- src/backend/replication/walsender.c | 6 ++++-- src/backend/utils/misc/guc.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 99a77aba280..95b9c0ae32d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -406,8 +406,7 @@ HandleWalKeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( - // write_lsn - // Not used, because we use SYNCHRONOUS_COMMIT_REMOTE_FLUSH. + // write_lsn - This is what durably stored in WAL service. lastFeedback.flushLsn, //flush_lsn - This is what durably stored in WAL service. lastFeedback.flushLsn, diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index c3b765e120e..2f834301cd0 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3019,8 +3019,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? 
- MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3832,12 +3832,14 @@ backpressure_lag(void) LSN_FORMAT_ARGS(applyPtr)); if ((flushPtr != UnknownXLogRecPtr + && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); } if ((applyPtr != UnknownXLogRecPtr + && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) { return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9158f475dbc..fb2c5fa67ea 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2920,7 +2920,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_MB, }, &max_replication_apply_lag, - 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, @@ -2932,7 +2932,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_MB, }, &max_replication_flush_lag, - 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, From 99ee0159ca939c0c7362e3ff6fbc147c3afd5d25 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Sat, 25 Dec 2021 19:42:14 +0300 Subject: [PATCH 084/214] Do not copy the obsolete apply_conf binary into Docker image --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4878e3cc755..496228cabcd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM zenithdb/compute-tools:latest AS compute-deps -# Only to get ready zenith_ctl and apply_conf binaries as deps +# Only to get ready zenith_ctl binary as deppendency # # Image with Postgres build deps @@ -56,7 +56,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local # Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl # Add postgres shared objects to the search path From acb3658018a63a11d10214a3c29ce24db030b17b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 31 Dec 2021 12:57:46 +0300 Subject: [PATCH 085/214] Reorder walproposer code in a more natural order (#112) Now functions in walproposer.c go in chronological order --- src/backend/replication/walproposer.c | 2945 +++++++++++++------------ 1 file changed, 1488 insertions(+), 1457 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 95b9c0ae32d..b307c79177d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -60,6 +60,10 @@ char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; +char *zenith_pageserver_connstring_walproposer = NULL; + /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -95,171 +99,412 @@ static TimestampTz last_reconnect_attempt; 
/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ static bool syncSafekeepers; -/* Declarations of a few functions ahead of time, so that we can define them out of order. */ +/* Prototypes for private functions */ +static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStart(void); +static void WalProposerLoop(void); +static void InitEventSet(void); +static void UpdateEventSet(WalKeeper *wk, uint32 events); +static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); +static void ShutdownConnection(WalKeeper *wk); +static void ResetConnection(WalKeeper *wk); +static long TimeToReconnect(TimestampTz now); +static void ReconnectWalKeepers(void); static void AdvancePollState(int i, uint32 events); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(WalKeeper *wk); +static void DetermineEpochStartLsn(void); +static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void SendProposerElected(WalKeeper *wk); +static void WalProposerStartStreaming(XLogRecPtr startpos); +static void StartStreaming(WalKeeper *wk); +static void SendMessageToNode(int i, WalMessage *msg); +static void BroadcastMessage(WalMessage *msg); +static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); +static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static bool SendAppendRequests(WalKeeper *wk); +static bool RecvAppendResponses(WalKeeper *wk); +static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); +static XLogRecPtr CalculateDiskConsistentLsn(void); +static XLogRecPtr CalculateMinFlushLsn(void); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); +static void HandleWalKeeperResponse(void); static bool AsyncRead(int i, char **buf, int *buf_size); static bool AsyncReadFixed(int i, void *value, size_t value_size); static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); static bool AsyncFlush(WalKeeper *wk); -static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); -static void BroadcastMessage(WalMessage *msg); -static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(WalKeeper *wk); -static void SendProposerElected(WalKeeper *wk); -static void StartStreaming(WalKeeper *wk); -static bool SendAppendRequests(WalKeeper *wk); - /* - * Combine hot standby feedbacks from all walkeepers. + * WAL proposer bgworker entry point. */ -static void -CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +void +WalProposerMain(Datum main_arg) { - hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].feedback.hs.ts != 0) - { - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) - { - hs->xmin = walkeeper[i].feedback.hs.xmin; - hs->ts = walkeeper[i].feedback.hs.ts; - } - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) - { - hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; - hs->ts = walkeeper[i].feedback.hs.ts; - } - } - } -} + BackgroundWorkerUnblockSignals(); -/* - * Get minimum of disk consistent LSNs of all safekeepers - */ -static XLogRecPtr -CalculateDiskConsistentLsn(void) -{ - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + GetXLogReplayRecPtr(&ThisTimeLineID); + + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { - if (walkeeper[i].feedback.diskConsistentLsn < lsn) - { - lsn = walkeeper[i].feedback.diskConsistentLsn; - } + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); } - return lsn; + + WalProposerStart(); } /* - * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the - * last WAL record that can be safely discarded. + * Entry point for `postgres --sync-safekeepers`. */ -static XLogRecPtr -CalculateMinFlushLsn(void) +void +WalProposerSync(int argc, char *argv[]) { - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].feedback.flushLsn < lsn) - lsn = walkeeper[i].feedback.flushLsn; - } - return lsn; -} + syncSafekeepers = true; -/* Initializes the internal event set, provided that it is currently null */ -static void -InitEventSet(void) -{ - if (waitEvents) - elog(FATAL, "double-initialization of event set"); + InitStandaloneProcess(argv[0]); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); - AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. 
+ * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); } /* - * Updates the events we're already waiting on for the WAL keeper, setting it to - * the provided `events` - * - * This function is called any time the WAL keeper's state switches to one where - * it has to wait to continue. This includes the full body of AdvancePollState - * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. */ -static void -UpdateEventSet(WalKeeper *wk, uint32 events) +void +WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) { - /* eventPos = -1 when we don't have an event */ - Assert(wk->eventPos != -1); + WalMessage *msg = CreateMessage(startpos, data, len); - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + if (msg != NULL) + BroadcastMessage(msg); } -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. */ -static void -HackyRemoveWalProposerEvent(WalKeeper *to_remove) +void +WalProposerPoll(void) { - /* Remove the existing event set */ - if (waitEvents) + while (true) { - FreeWaitEventSet(waitEvents); - waitEvents = NULL; - } - /* Re-initialize it without adding any walkeeper events */ - InitEventSet(); + WalKeeper *wk; + int rc; + int i; + WaitEvent event; + TimestampTz now = GetCurrentTimestamp(); - /* - * loop through the existing walkeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < n_walkeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - WalKeeper *wk = &walkeeper[i]; + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper *) event.user_data; + i = (int) (wk - walkeeper); - wk->eventPos = -1; + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + AdvancePollState(i, event.events); - if (wk == to_remove) - continue; + /* + * If the timeout expired, attempt to reconnect to any walkeepers that + * we dropped + */ + ReconnectWalKeepers(); - /* If this WAL keeper isn't offline, add an event for it! */ - if (wk->conn != NULL) + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. 
(no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) { - desired_events = WalKeeperStateDesiredEvents(wk->state); - wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); + ResetLatch(MyLatch); + break; + } + if (rc == 0) /* timeout expired: poll state */ + { + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (lastSentLsn != InvalidXLogRecPtr) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + } } } } -/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ -static void -ShutdownConnection(WalKeeper *wk) +/* + * Register a background worker proposing WAL to wal acceptors. + */ +void +WalProposerRegister(void) { - if (wk->conn) - walprop_finish(wk->conn); - wk->conn = NULL; - wk->state = SS_OFFLINE; - wk->flushWrite = false; - wk->currMsg = NULL; - wk->ackMsg = NULL; + BackgroundWorker bgw; - if (wk->voteResponse.termHistory.entries) - pfree(wk->voteResponse.termHistory.entries); - wk->voteResponse.termHistory.entries = NULL; + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static void +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) +{ + char *host; + char *sep; + char *port; + + /* Load the libpq-specific functions */ + load_file("libpqwalproposer", false); + if (WalProposerFunctions == NULL) + elog(ERROR, "libpqwalproposer didn't initialize correctly"); + + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + load_file("zenith", false); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) + { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_walkeepers + 1 >= MAX_WALKEEPERS) + { + elog(FATAL, "Too many walkeepers"); + } + walkeeper[n_walkeepers].host = host; + walkeeper[n_walkeepers].port = port; + walkeeper[n_walkeepers].state = SS_OFFLINE; + walkeeper[n_walkeepers].conn = NULL; + + /* + * Set conninfo to empty. 
We'll fill it out once later, in + * `ResetConnection` as needed + */ + walkeeper[n_walkeepers].conninfo[0] = '\0'; + initStringInfo(&walkeeper[n_walkeepers].outbuf); + walkeeper[n_walkeepers].flushWrite = false; + walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].ackMsg = NULL; + walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; + n_walkeepers += 1; + } + if (n_walkeepers < 1) + { + elog(FATAL, "WalKeepers addresses are not specified"); + } + quorum = n_walkeepers / 2 + 1; + + /* Fill the greeting package */ + proposerGreeting.tag = 'g'; + proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; + proposerGreeting.pgVersion = PG_VERSION_NUM; + pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); + proposerGreeting.systemId = systemId; + if (!zenith_timeline_walproposer) + elog(FATAL, "zenith.zenith_timeline is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + if (!zenith_tenant_walproposer) + elog(FATAL, "zenith.zenith_tenant is not provided"); + if (*zenith_tenant_walproposer != '\0' && + !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + + proposerGreeting.timeline = ThisTimeLineID; + proposerGreeting.walSegSize = wal_segment_size; + + InitEventSet(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(&walkeeper[i]); + } + + WalProposerLoop(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +/* Initializes the internal event set, provided that it is currently null */ +static void +InitEventSet(void) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} + +/* + * Updates the events we're already waiting on for the WAL keeper, setting it to + * the provided `events` + * + * This function is called any time the WAL keeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + */ +static void +UpdateEventSet(WalKeeper *wk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(wk->eventPos != -1); + + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(WalKeeper *to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any walkeeper events */ + InitEventSet(); + + /* + * loop through the existing walkeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. 
+ */ + for (int i = 0; i < n_walkeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + WalKeeper *wk = &walkeeper[i]; + + wk->eventPos = -1; + + if (wk == to_remove) + continue; + + /* If this WAL keeper isn't offline, add an event for it! */ + if (wk->conn != NULL) + { + desired_events = WalKeeperStateDesiredEvents(wk->state); + wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); + } + } +} + +/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(WalKeeper *wk) +{ + if (wk->conn) + walprop_finish(wk->conn); + wk->conn = NULL; + wk->state = SS_OFFLINE; + wk->flushWrite = false; + wk->currMsg = NULL; + wk->ackMsg = NULL; + + if (wk->voteResponse.termHistory.entries) + pfree(wk->voteResponse.termHistory.entries); + wk->voteResponse.termHistory.entries = NULL; HackyRemoveWalProposerEvent(wk); } @@ -357,1495 +602,1307 @@ ResetConnection(WalKeeper *wk) } /* - * Calculate WAL position acknowledged by quorum + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). */ -static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(void) +static long +TimeToReconnect(TimestampTz now) { - XLogRecPtr responses[MAX_WALKEEPERS]; + TimestampTz passed; + TimestampTz till_reconnect; - /* - * Sort acknowledged LSNs - */ - for (int i = 0; i < n_walkeepers; i++) - { - /* - * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. - */ - responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? - walkeeper[i].feedback.flushLsn : 0; - } - qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + if (wal_acceptor_reconnect_timeout <= 0) + return -1; - /* - * Get the smallest LSN committed by quorum - */ - return responses[n_walkeepers - quorum]; + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); } +/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ static void -HandleWalKeeperResponse(void) +ReconnectWalKeepers(void) { - HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; - XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; - - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = CalculateDiskConsistentLsn(); + TimestampTz now = GetCurrentTimestamp(); - if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + if (TimeToReconnect(now) == 0) { - - if (minQuorumLsn > lastFeedback.flushLsn) - lastFeedback.flushLsn = minQuorumLsn; - - lastFeedback.diskConsistentLsn = diskConsistentLsn; - - /* advance the replication slot */ - if (!syncSafekeepers) - ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. 
- lastFeedback.diskConsistentLsn, - GetCurrentTimestamp(), false); - } - - CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) - { - lastFeedback.hs = hsFeedback; - if (!syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); - } - - /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. - * - * Advanced truncateLsn should be not further than nearest commitLsn. - * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. - */ - minFlushLsn = CalculateMinFlushLsn(); - if (minFlushLsn > truncateLsn) - truncateLsn = minFlushLsn; - - /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) - { - WalMessage *msg = msgQueueHead; - msgQueueHead = msg->next; - - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); - free(msg); - } - if (!msgQueueHead) /* queue is empty */ - msgQueueTail = NULL; - /* truncateLsn always points to the first chunk in the queue */ - if (msgQueueHead) - { - /* Max takes care of special 0-sized messages */ - Assert(truncateLsn >= msgQueueHead->req.beginLsn && - truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); - } - - /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and - * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-walkeeper) we currently - * wait for all seemingly alive walkeepers to get synced. - */ - if (syncSafekeepers) - { - int n_synced; - - n_synced = 0; - for (int i = 0; i < n_walkeepers; i++) - { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; - - /* alive safekeeper which is not synced yet; wait for it */ - if (wk->state != SS_OFFLINE && !synced) - return; - if (synced) - n_synced++; - } - if (n_synced >= quorum) - { - /* All walkeepers synced! 
*/ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - } -} - -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; -char *zenith_pageserver_connstring_walproposer = NULL; - - -static void -WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) -{ - char *host; - char *sep; - char *port; - - /* Load the libpq-specific functions */ - load_file("libpqwalproposer", false); - if (WalProposerFunctions == NULL) - elog(ERROR, "libpqwalproposer didn't initialize correctly"); - - load_file("libpqwalreceiver", false); - if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); - - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) - { - port = strchr(host, ':'); - if (port == NULL) - { - elog(FATAL, "port is not specified"); - } - *port++ = '\0'; - sep = strchr(port, ','); - if (sep != NULL) - *sep++ = '\0'; - if (n_walkeepers + 1 >= MAX_WALKEEPERS) - { - elog(FATAL, "Too many walkeepers"); - } - walkeeper[n_walkeepers].host = host; - walkeeper[n_walkeepers].port = port; - walkeeper[n_walkeepers].state = SS_OFFLINE; - walkeeper[n_walkeepers].conn = NULL; - - /* - * Set conninfo to empty. We'll fill it out once later, in - * `ResetConnection` as needed - */ - walkeeper[n_walkeepers].conninfo[0] = '\0'; - initStringInfo(&walkeeper[n_walkeepers].outbuf); - walkeeper[n_walkeepers].flushWrite = false; - walkeeper[n_walkeepers].currMsg = NULL; - walkeeper[n_walkeepers].ackMsg = NULL; - walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; - n_walkeepers += 1; - } - if (n_walkeepers < 1) - { - elog(FATAL, "WalKeepers addresses are not specified"); - } - quorum = n_walkeepers / 2 + 1; - - /* Fill the greeting package */ - proposerGreeting.tag = 'g'; - proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; - proposerGreeting.pgVersion = PG_VERSION_NUM; - pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = systemId; - if (!zenith_timeline_walproposer) - elog(FATAL, "zenith.zenith_timeline is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) - elog(FATAL, "zenith.zenith_tenant is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - - proposerGreeting.timeline = ThisTimeLineID; - proposerGreeting.walSegSize = wal_segment_size; - - InitEventSet(); -} - -static void -WalProposerLoop(void) -{ - while (true) - WalProposerPoll(); -} - -static void -WalProposerStart(void) -{ - - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) - { - ResetConnection(&walkeeper[i]); - } - - WalProposerLoop(); -} - -/* - * WAL proposer bgworeker entry point - */ -void -WalProposerMain(Datum main_arg) -{ - /* Establish signal handlers. 
*/ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - - GetXLogReplayRecPtr(&ThisTimeLineID); - - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); - - last_reconnect_attempt = GetCurrentTimestamp(); - - application_name = (char *) "walproposer"; /* for - * synchronous_standby_names */ - am_wal_proposer = true; - am_walsender = true; - InitWalSender(); - - /* Create replication slot for WAL proposer if not exists */ - if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) - { - ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); - ReplicationSlotReserveWal(); - /* Write this slot to disk */ - ReplicationSlotMarkDirty(); - ReplicationSlotSave(); - ReplicationSlotRelease(); - } - - WalProposerStart(); -} - -void -WalProposerSync(int argc, char *argv[]) -{ - syncSafekeepers = true; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. - * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - WalProposerInit(0, 0); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} - -static void -WalProposerStartStreaming(XLogRecPtr startpos) -{ - StartReplicationCmd cmd; - - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = proposerGreeting.timeline; - cmd.startpoint = startpos; - StartReplication(&cmd); -} + last_reconnect_attempt = now; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_OFFLINE) + ResetConnection(&walkeeper[i]); + } + } +} /* - * Start sending message to the particular node. - * - * Always updates the state and event set for the WAL keeper; setting either of - * these before calling would be redundant work. + * Performs the logic for advancing the state machine of the 'i'th walkeeper, + * given that a certain set of events has occured. */ static void -SendMessageToNode(int i, WalMessage *msg) +AdvancePollState(int i, uint32 events) { WalKeeper *wk = &walkeeper[i]; - - /* we shouldn't be already sending something */ - Assert(wk->currMsg == NULL); - /* - * Skip already acknowledged messages. Used after reconnection to get to - * the first not yet sent message. Otherwise we always just send 'msg'. 
+ * Keep advancing the state while either: (a) the event is still + * unprocessed (usually because it's the first iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; + while (events || StateShouldImmediatelyExecute(wk->state)) + { + /* + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. + */ + AssertEventsOkForState(events, wk); - wk->currMsg = msg; - wk->flushWrite = false; + /* Execute the code corresponding to the current state */ + switch (wk->state) + { + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ + + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + { + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - if (!SendAppendRequests(wk)) - return; -} + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; -/* - * Broadcast new message to all caught-up walkeepers - */ -static void -BroadcastMessage(WalMessage *msg) -{ - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) - { - SendMessageToNode(i, msg); - } - } -} + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); -static WalMessage * -CreateMessage(XLogRecPtr startpos, char *data, int len) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; - XLogRecPtr endpos; + /* + * Once we're fully connected, we can move to the + * next state + */ + wk->state = SS_EXEC_STARTWALPUSH; - len -= XLOG_HDR_SIZE; - endpos = startpos + len; - if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) - { - /* Message already queued */ - return NULL; - } - Assert(len >= 0); - msg = (WalMessage *) malloc(sizeof(WalMessage) + len); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; + /* + * Even though SS_EXEC_STARTWALPUSH doesn't wait + * on anything, we do need to replace the current + * event, so we have to just pick something. We'll + * eventually need the socket to be readable, so + * we go with that. 
+ */ + new_events = WL_SOCKET_READABLE; + break; - msg->size = sizeof(AppendRequestHeader) + len; - msg->next = NULL; - msg->ackMask = 0; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = startpos; - msg->req.endLsn = endpos; - msg->req.proposerId = proposerGreeting.proposerId; - memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; - Assert(msg->req.endLsn >= lastSentLsn); - lastSentLsn = msg->req.endLsn; - return msg; -} + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); -void -WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) -{ - WalMessage *msg = CreateMessage(startpos, data, len); + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. + */ + ShutdownConnection(wk); + return; + } - if (msg != NULL) - BroadcastMessage(msg); -} + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(wk); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + break; + } -/* - * Create WAL message with no data, just to let the walkeepers - * know that commit lsn has advanced. - */ -static WalMessage * -CreateMessageCommitLsnOnly(XLogRecPtr lsn) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; + /* + * Send "START_WAL_PUSH" command to the walkeeper. 
After + * sending, wait for response with SS_WAIT_EXEC_RESULT + */ + case SS_EXEC_STARTWALPUSH: + { + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); + break; + } - msg = (WalMessage *) malloc(sizeof(WalMessage)); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; + case SS_WAIT_EXEC_RESULT: + switch (walprop_get_query_result(wk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: - msg->size = sizeof(AppendRequestHeader); - msg->next = NULL; - msg->ackMask = 0; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; + /* + * Because this state is immediately executable, we'll + * start this on the next iteration of the loop + */ + wk->state = SS_HANDSHAKE_SEND; + break; - /* - * This serves two purposes: 1) After all msgs from previous epochs are - * pushed we queue empty WalMessage with lsn set to epochStartLsn which - * commands to switch the epoch, which allows to do the switch without - * creating new epoch records (we especially want to avoid such in --sync - * mode). Walproposer can advance commit_lsn only after the switch, so - * this lsn (reported back) also is the first possible advancement point. - * 2) Maintain common invariant of queue entries sorted by LSN. - */ - msg->req.beginLsn = lsn; - msg->req.endLsn = lsn; - msg->req.proposerId = proposerGreeting.proposerId; + /* + * Needs repeated calls to finish. Wait until the + * socket is readable + */ + case WP_EXEC_NEEDS_INPUT: - /* - * truncateLsn and commitLsn are set just before the message sent, in - * SendMessageToNode() - */ - return msg; -} + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ + break; -/* latest term in TermHistory, or 0 is there is no entries */ -static term_t -GetHighestTerm(TermHistory *th) -{ - return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; -} + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; -/* safekeeper's epoch is the term of the highest entry in the log */ -static term_t -GetEpoch(WalKeeper *wk) -{ - return GetHighestTerm(&wk->voteResponse.termHistory); -} + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad response from walkeeper %s:%s query execution", + wk->host, wk->port); + ShutdownConnection(wk); + return; + } + break; -/* - * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since - * which we'll write WAL in our term. - * - * Sets truncateLsn along the way (though it is not of much use at this point -- - * only for skipping recovery). 
- */ -static void -DetermineEpochStartLsn(void) -{ - TermHistory *dth; + /* + * Start handshake: first of all send information about the + * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ + case SS_HANDSHAKE_SEND: - propEpochStartLsn = InvalidXLogRecPtr; - donorEpoch = 0; - truncateLsn = InvalidXLogRecPtr; + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. + */ + if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) + return; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - { - if (GetEpoch(&walkeeper[i]) > donorEpoch || - (GetEpoch(&walkeeper[i]) == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) - { - donorEpoch = GetEpoch(&walkeeper[i]); - propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; - donor = i; - } - truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); - } - } + break; - /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing - * was committed yet. To keep the idea of always starting streaming since - * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. - */ - if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) - { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); - propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; - ReplicationSlotRelease(); - elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); - } + /* + * Finish handshake comms: receive information about the WAL + * keeper + */ + case SS_HANDSHAKE_RECV: - /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to - * some connected safekeeper; it must have carried truncateLsn pointing to - * the first record. - */ - Assert((truncateLsn != InvalidXLogRecPtr) || - (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) + return; - /* - * Proposer's term history is the donor's + its own entry. - */ - dth = &walkeeper[donor].voteResponse.termHistory; - propTermHistory.n_entries = dth->n_entries + 1; - propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); - memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); - propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; - propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - quorum, - propTerm, - LSN_FORMAT_ARGS(propEpochStartLsn), - walkeeper[donor].host, walkeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); -} + /* + * Don't need to update the event set yet. Either we update + * the event set to WL_SOCKET_READABLE *or* we change the + * state to SS_SEND_VOTE in the loop below + */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + wk->feedback.flushLsn = truncateLsn; + wk->feedback.hs.ts = 0; -/* - * How much milliseconds left till we should attempt reconnection to - * safekeepers? 
Returns 0 if it is already high time, -1 if we never reconnect - * (do we actually need this?). - */ -static long -TimeToReconnect(TimestampTz now) -{ - TimestampTz passed; - TimestampTz till_reconnect; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(walkeeper[i].greet.term, propTerm); - if (wal_acceptor_reconnect_timeout <= 0) - return -1; + /* + * Check if we have quorum. If there aren't enough walkeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election + */ + if (++n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. + */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else + { + if (n_connected == quorum) + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } - passed = now - last_reconnect_attempt; - till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; - if (till_reconnect <= 0) - return 0; - return (long) (till_reconnect / 1000); -} + /* + * Now send voting request to the cohort and wait + * responses + */ + for (int j = 0; j < n_walkeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the walkeeper is + * participating in voting, but hasn't sent anything + * yet. The ones that have sent something are given + * SS_SEND_VOTE or SS_WAIT_VERDICT. + */ + if (walkeeper[j].state == SS_VOTING) + { + walkeeper[j].state = SS_SEND_VOTE; + /* Immediately send info */ + AdvancePollState(j, WL_NO_EVENTS); + } + } + } + break; -/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ -static void -ReconnectWalKeepers(void) -{ - TimestampTz now = GetCurrentTimestamp(); + /* + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see + * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. 
+ */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + break; - if (TimeToReconnect(now) == 0) - { - last_reconnect_attempt = now; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(&walkeeper[i]); - } - } -} + /* We have quorum for voting, send our vote request */ + case SS_SEND_VOTE: + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; -/* - * Receive WAL from most advanced WAL keeper - */ -static bool -WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) -{ - char conninfo[MAXCONNINFO]; - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ + break; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[donor].host, walkeeper[donor].port, - err))); - return false; - } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + /* Start reading the walkeeper response for our candidate */ + case SS_WAIT_VERDICT: + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) + return; - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->voteResponse.term, propTerm); + } + Assert(wk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? 
*/ + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void) CreateMessage(rec_start_lsn, buf, len); - elog(DEBUG1, "Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len); - if (rec_end_lsn >= endpos) - break; - } - } - elog(DEBUG1, "end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)); - walrcv_disconnect(wrconn); - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } + /* recovery already performed, just start streaming */ + SendProposerElected(wk); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ - return true; -} + DetermineEpochStartLsn(); -/* - * Determine for wk the starting streaming point and send it message - * 1) Announcing we are elected proposer (which immediately advances epoch if - * safekeeper is synced, being important for sync-safekeepers) - * 2) Communicating starting streaming point -- safekeeper must truncate its WAL - * beyond it -- and history of term switching. - * - * Sets wk->startStreamingAt. - */ -static void -SendProposerElected(WalKeeper *wk) -{ - ProposerElected msg; - TermHistory *th; - term_t lastCommonTerm; - int i; + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } - /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. - * - * Note: there is a vanishingly small chance of no common point even if - * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. - */ - th = &wk->voteResponse.termHistory; - /* - * If any WAL is present on the sk, it must be authorized by some term. - * OTOH, without any WAL there are no term swiches in the log. - */ - Assert((th->n_entries == 0) == - (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); - /* We must start somewhere. 
*/ - Assert(propTermHistory.n_entries >= 1); + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } - for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) - { - if (propTermHistory.entries[i].term != th->entries[i].term) - break; - /* term must begin everywhere at the same point */ - Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); - } - i--; /* step back to the last common term */ - if (i < 0) - { - /* safekeeper is empty or no common point, start from the beginning */ - wk->startStreamingAt = propTermHistory.entries[0].lsn; - } - else - { - /* - * End of (common) term is the start of the next except it is the last - * one; there it is flush_lsn in case of safekeeper or, in case of - * proposer, LSN it is currently writing, but then we just pick - * safekeeper pos as it obviously can't be higher. - */ - if (propTermHistory.entries[i].term == propTerm) - { - wk->startStreamingAt = wk->voteResponse.flushLsn; - } - else - { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - wk->voteResponse.flushLsn); - wk->startStreamingAt = Min(propEndLsn, skEndLsn); - } - } + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - msg.tag = 'e'; - msg.term = propTerm; - msg.startStreamingAt = wk->startStreamingAt; - msg.termHistory = &propTermHistory; + /* keep polling until all walkeepers are synced */ + return; + } - lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; - elog(LOG, - "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); - - resetStringInfo(&wk->outbuf); - pq_sendint64_le(&wk->outbuf, msg.tag); - pq_sendint64_le(&wk->outbuf, msg.term); - pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); - } + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ + } - if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) - return; + break; - StartStreaming(wk); -} + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: -/* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. - */ -static void -StartStreaming(WalKeeper *wk) -{ - int wki = wk - walkeeper; + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. 
+ */ + if (!AsyncFlush(wk)) + return; + + StartStreaming(wk); - /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. - */ - wk->state = SS_ACTIVE; - UpdateEventSet(wk, WL_SOCKET_READABLE); + break; - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= wk->startStreamingAt) - { - /* message is already received by this walkeeper */ - msg->ackMask |= 1 << wki; - } - else - { - SendMessageToNode(wki, msg); - return; - } - } -} -/* - * Advance the WAL proposer state machine, waiting each time for events to occur - */ -void -WalProposerPoll(void) -{ - while (true) - { - WalKeeper *wk; - int rc; - int i; - WaitEvent event; - TimestampTz now = GetCurrentTimestamp(); + /* + * Idle state for sending WAL. Moved out only by calls to + * SendMessageToNode + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + break; - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper *) event.user_data; - i = (int) (wk - walkeeper); - /* - * If the event contains something that one of our walkeeper states - * was waiting for, we'll advance its state. - */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(i, event.events); + case SS_ACTIVE: + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) + return; - /* - * If the timeout expired, attempt to reconnect to any walkeepers that - * we dropped - */ - ReconnectWalKeepers(); + UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); + break; + } /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) + * We've already done something for these events - don't attempt more + * states than we need to. */ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); - break; - } - if (rc == 0) /* timeout expired: poll state */ - { - /* - * If no WAL was generated during timeout (and we have already - * collected the quorum), then send pool message - */ - if (lastSentLsn != InvalidXLogRecPtr) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - } - } + events = WL_NO_EVENTS; } } +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(WalKeeper *wk) +{ + return GetHighestTerm(&wk->voteResponse.termHistory); +} + /* - * Send queue messages starting from wk->currMsg until the end or non-writable - * socket, whichever comes first. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). 
*/ -static bool -SendAppendRequests(WalKeeper *wk) +static void +DetermineEpochStartLsn(void) { - int wki = wk - walkeeper; - WalMessage *msg; - AppendRequestHeader *req; + TermHistory *dth; - if (wk->flushWrite) - { - if (!AsyncFlush(wk)) - /* - * AsyncFlush failed, that could happen if the socket is closed or - * we have nothing to write and should wait for writeable socket. - */ - return wk->state == SS_ACTIVE; + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + { + if (GetEpoch(&walkeeper[i]) > donorEpoch || + (GetEpoch(&walkeeper[i]) == donorEpoch && + walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&walkeeper[i]); + propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); + } } - while (wk->currMsg) + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. To keep the idea of always starting streaming since + * record boundary (which simplifies decoding on safekeeper), take start + * position of the slot. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - msg = wk->currMsg; - req = &msg->req; + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); + propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + ReplicationSlotRelease(); + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. + */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); - Assert((msg->ackMask & (1 << wki)) == 0); + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &walkeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. 
- */ - if (wk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + walkeeper[donor].host, walkeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn) + ); +} - Assert(wk->startStreamingAt < req->endLsn); +/* + * Receive WAL from most advanced WAL keeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; - len = msg->req.endLsn - wk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = wk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - - msg->req.beginLsn, - len); - } + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + walkeeper[donor].host, walkeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; - /* if this is the first sent message, we should start processing feedback */ - if (wk->ackMsg == NULL) - wk->ackMsg = wk->currMsg; + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_ACTIVE)) + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) { - if (req != &msg->req) - free(req); - if (wk->state == SS_ACTIVE) + if (len == 0) { - wk->flushWrite = true; - return true; + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void) CreateMessage(rec_start_lsn, buf, len); + elog(DEBUG1, "Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len); + if (rec_end_lsn >= endpos) + break; } - return false; } - if (req != &msg->req) - free(req); - - /* continue writing the next message */ - wk->currMsg = 
wk->currMsg->next; + elog(DEBUG1, "end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)); + walrcv_disconnect(wrconn); + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; } return true; } /* - * Receive and process all available feedback. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. + * Determine for wk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. * - * NB: This function can call SendMessageToNode and produce new messages. + * Sets wk->startStreamingAt. */ -static bool -RecvAppendResponses(WalKeeper *wk) +static void +SendProposerElected(WalKeeper *wk) { - XLogRecPtr minQuorumLsn; - int wki = wk - walkeeper; - bool readAnything = false; + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; - while (true) + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. + */ + th = &wk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. + */ + Assert((th->n_entries == 0) == + (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) { - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. - */ - if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + if (propTermHistory.entries[i].term != th->entries[i].term) break; - - Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); - + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + wk->startStreamingAt = propTermHistory.entries[0].lsn; + } + else + { /* - * We shouldn't read responses ahead of wk->currMsg, because that will - * look like we are receiving responses for messages that haven't been - * sent yet. This can happen when message was placed in a buffer in - * SendAppendRequests, but sent through a wire only with a flush inside - * AsyncReadFixed. In this case, we should move wk->currMsg. + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. 
*/ - if (wk->ackMsg == wk->currMsg) + if (propTermHistory.entries[i].term == propTerm) { - /* Couldn't happen without flush flag */ - Assert(wk->flushWrite); - - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; + wk->startStreamingAt = wk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + wk->voteResponse.flushLsn); + wk->startStreamingAt = Min(propEndLsn, skEndLsn); } + } - wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms - * receiving of this - * message */ + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); - wk->ackMsg = wk->ackMsg->next; - readAnything = true; + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = wk->startStreamingAt; + msg.termHistory = &propTermHistory; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + + resetStringInfo(&wk->outbuf); + pq_sendint64_le(&wk->outbuf, msg.tag); + pq_sendint64_le(&wk->outbuf, msg.term); + pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); } - if (!readAnything) - return wk->state == SS_ACTIVE; + if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; - HandleWalKeeperResponse(); + StartStreaming(wk); +} - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other - * outbound messages pending. +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = proposerGreeting.timeline; + cmd.startpoint = startpos; + StartReplication(&cmd); +} + +/* + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. + */ +static void +StartStreaming(WalKeeper *wk) +{ + int wki = wk - walkeeper; + + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentCommitLsn) + wk->state = SS_ACTIVE; + UpdateEventSet(wk, WL_SOCKET_READABLE); + + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - lastSentCommitLsn = minQuorumLsn; + if (msg->req.endLsn <= wk->startStreamingAt) + { + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << wki; + } + else + { + SendMessageToNode(wki, msg); + return; + } } - - return wk->state == SS_ACTIVE; } -/* Performs the logic for advancing the state machine of the 'i'th walkeeper, - * given that a certain set of events has occured. */ +/* + * Start sending message to the particular node. + * + * Always updates the state and event set for the WAL keeper; setting either of + * these before calling would be redundant work. 
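The start-streaming position that SendProposerElected derives above comes from walking both term histories to the last common entry and taking the smaller of the two ends of that term. A simplified, hypothetical sketch of the same search (the helper name and types are illustrative, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t term_t;
typedef uint64_t XLogRecPtr;

typedef struct
{
    term_t      term;
    XLogRecPtr  lsn;                  /* where this term starts in the log */
} TermSwitchEntry;

/*
 * A simplified variant of the divergence-point search in SendProposerElected:
 * given the proposer's history prop[0..n_prop), the safekeeper's history
 * sk[0..n_sk) and the safekeeper's flush position, return the LSN from which
 * streaming should start.
 */
static XLogRecPtr
divergence_point(const TermSwitchEntry *prop, int n_prop,
                 const TermSwitchEntry *sk, int n_sk,
                 XLogRecPtr sk_flush_lsn)
{
    int         i;
    int         n = n_prop < n_sk ? n_prop : n_sk;

    for (i = 0; i < n; i++)
        if (prop[i].term != sk[i].term)
            break;
    i--;                              /* step back to the last common entry */
    if (i < 0)
        return prop[0].lsn;           /* no common point: stream from the start */

    /* end of the common term on each side; stream from the smaller one */
    {
        XLogRecPtr  prop_end = (i + 1 < n_prop) ? prop[i + 1].lsn : sk_flush_lsn;
        XLogRecPtr  sk_end = (i + 1 < n_sk) ? sk[i + 1].lsn : sk_flush_lsn;

        return prop_end < sk_end ? prop_end : sk_end;
    }
}

int
main(void)
{
    /* hypothetical histories: they agree on term 1, diverge at the next entry */
    TermSwitchEntry prop[] = {{1, 0x100}, {3, 0x300}};
    TermSwitchEntry sk[] = {{1, 0x100}, {2, 0x280}};

    /* last common term is 1; its end is min(0x300, 0x280) = 0x280 */
    printf("start streaming at %lX\n",
           (unsigned long) divergence_point(prop, 2, sk, 2, 0x350));
    return 0;
}
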
+ */ static void -AdvancePollState(int i, uint32 events) +SendMessageToNode(int i, WalMessage *msg) { WalKeeper *wk = &walkeeper[i]; + + /* we shouldn't be already sending something */ + Assert(wk->currMsg == NULL); + /* - * Keep advancing the state while either: (a) the event is still - * unprocessed (usually because it's the first iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + * Skip already acknowledged messages. Used after reconnection to get to + * the first not yet sent message. Otherwise we always just send 'msg'. */ - while (events || StateShouldImmediatelyExecute(wk->state)) - { - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. - */ - AssertEventsOkForState(events, wk); + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; - /* Execute the code corresponding to the current state */ - switch (wk->state) + wk->currMsg = msg; + wk->flushWrite = false; + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + if (!SendAppendRequests(wk)) + return; +} + +/* + * Broadcast new message to all caught-up walkeepers + */ +static void +BroadcastMessage(WalMessage *msg) +{ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) { - /* - * WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + SendMessageToNode(i, msg); + } + } +} + +static WalMessage * +CreateMessage(XLogRecPtr startpos, char *data, int len) +{ + /* Create new message and append it to message queue */ + WalMessage *msg; + XLogRecPtr endpos; + + len -= XLOG_HDR_SIZE; + endpos = startpos + len; + if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) + { + /* Message already queued */ + return NULL; + } + Assert(len >= 0); + msg = (WalMessage *) malloc(sizeof(WalMessage) + len); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(AppendRequestHeader) + len; + msg->next = NULL; + msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.epochStartLsn = propEpochStartLsn; + msg->req.beginLsn = startpos; + msg->req.endLsn = endpos; + msg->req.proposerId = proposerGreeting.proposerId; + memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); - /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + Assert(msg->req.endLsn >= lastSentLsn); + lastSentLsn = msg->req.endLsn; + return msg; +} - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; +/* + * Create WAL message with no data, just to let the walkeepers + * know that commit lsn has advanced. 
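Queued WAL is held in one allocation per message: a fixed header followed immediately by the record bytes, which is why the send path can write header and body with a single call. A rough standalone sketch of that layout, using a cut-down struct rather than the patch's WalMessage/AppendRequestHeader:

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

/* simplified stand-in for the patch's queued message */
typedef struct Msg
{
    struct Msg *next;
    uint32_t    ackMask;
    XLogRecPtr  beginLsn;
    XLogRecPtr  endLsn;
    /* WAL bytes are stored right after the struct, in the same allocation */
} Msg;

static Msg *
make_msg(XLogRecPtr begin, const char *wal, size_t len)
{
    Msg        *m = malloc(sizeof(Msg) + len);

    if (m == NULL)
        return NULL;
    m->next = NULL;
    m->ackMask = 0;
    m->beginLsn = begin;
    m->endLsn = begin + len;
    memcpy(m + 1, wal, len);          /* "m + 1" is the first byte past the header */
    return m;
}

int
main(void)
{
    const char  wal[] = "hypothetical WAL bytes";
    Msg        *m = make_msg(0x1000, wal, sizeof(wal));

    free(m);
    return 0;
}
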
+ */ +static WalMessage * +CreateMessageCommitLsnOnly(XLogRecPtr lsn) +{ + /* Create new message and append it to message queue */ + WalMessage *msg; - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); + msg = (WalMessage *) malloc(sizeof(WalMessage)); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; - /* - * Once we're fully connected, we can move to the - * next state - */ - wk->state = SS_EXEC_STARTWALPUSH; + msg->size = sizeof(AppendRequestHeader); + msg->next = NULL; + msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.epochStartLsn = propEpochStartLsn; - /* - * Even though SS_EXEC_STARTWALPUSH doesn't wait - * on anything, we do need to replace the current - * event, so we have to just pick something. We'll - * eventually need the socket to be readable, so - * we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; + /* + * This serves two purposes: 1) After all msgs from previous epochs are + * pushed we queue empty WalMessage with lsn set to epochStartLsn which + * commands to switch the epoch, which allows to do the switch without + * creating new epoch records (we especially want to avoid such in --sync + * mode). Walproposer can advance commit_lsn only after the switch, so + * this lsn (reported back) also is the first possible advancement point. + * 2) Maintain common invariant of queue entries sorted by LSN. + */ + msg->req.beginLsn = lsn; + msg->req.endLsn = lsn; + msg->req.proposerId = proposerGreeting.proposerId; - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; + /* + * truncateLsn and commitLsn are set just before the message sent, in + * SendMessageToNode() + */ + return msg; +} - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); +/* + * Send queue messages starting from wk->currMsg until the end or non-writable + * socket, whichever comes first. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(WalKeeper *wk) +{ + int wki = wk - walkeeper; + WalMessage *msg; + AppendRequestHeader *req; - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectWalKeepers. - */ - ShutdownConnection(wk); - return; - } + if (wk->flushWrite) + { + if (!AsyncFlush(wk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. + */ + return wk->state == SS_ACTIVE; - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. - */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } - /* - * Send "START_WAL_PUSH" command to the walkeeper. 
After - * sending, wait for response with SS_WAIT_EXEC_RESULT - */ - case SS_EXEC_STARTWALPUSH: - { - char *query = NULL; - if (zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(wk->conn, query)) - { - pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - } - pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); - break; - } + while (wk->currMsg) + { + msg = wk->currMsg; + req = &msg->req; - case SS_WAIT_EXEC_RESULT: - switch (walprop_get_query_result(wk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; - /* - * Because this state is immediately executable, we'll - * start this on the next iteration of the loop - */ - wk->state = SS_HANDSHAKE_SEND; - break; + Assert((msg->ackMask & (1 << wki)) == 0); - /* - * Needs repeated calls to finish. Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: + /* + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. + */ + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; + Assert(wk->startStreamingAt < req->endLsn); - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); - return; - } - break; + elog(LOG, + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - /* - * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ - case SS_HANDSHAKE_SEND: + /* if this is the first sent message, we should start processing feedback */ + if (wk->ackMsg == NULL) + wk->ackMsg = wk->currMsg; - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. 
- */ - if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) - return; + /* + * We write with msg->size here because the body of the + * message is stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(wk, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, + SS_ACTIVE)) + { + if (req != &msg->req) + free(req); + if (wk->state == SS_ACTIVE) + { + wk->flushWrite = true; + return true; + } + return false; + } + if (req != &msg->req) + free(req); - break; + /* continue writing the next message */ + wk->currMsg = wk->currMsg->next; + } - /* - * Finish handshake comms: receive information about the WAL - * keeper - */ - case SS_HANDSHAKE_RECV: + return true; +} - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) - return; +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(WalKeeper *wk) +{ + XLogRecPtr minQuorumLsn; + int wki = wk - walkeeper; + bool readAnything = false; - /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + break; - /* - * Don't need to update the event set yet. Either we update - * the event set to WL_SOCKET_READABLE *or* we change the - * state to SS_SEND_VOTE in the loop below - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; + Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. - */ - propTerm = Max(walkeeper[i].greet.term, propTerm); + /* + * We shouldn't read responses ahead of wk->currMsg, because that will + * look like we are receiving responses for messages that haven't been + * sent yet. This can happen when message was placed in a buffer in + * SendAppendRequests, but sent through a wire only with a flush inside + * AsyncReadFixed. In this case, we should move wk->currMsg. + */ + if (wk->ackMsg == wk->currMsg) + { + /* Couldn't happen without flush flag */ + Assert(wk->flushWrite); - /* - * Check if we have quorum. If there aren't enough walkeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election - */ - if (++n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. 
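The "cut version" formed above for the first message is plain offset arithmetic: skip the part of the body below startStreamingAt and shrink the length accordingly. A small illustrative helper (hypothetical name, simplified types) showing the same arithmetic:

#include <string.h>
#include <stdint.h>
#include <assert.h>

typedef uint64_t XLogRecPtr;

/*
 * Copy into "out" only the part of a message body covering
 * [startStreamingAt, endLsn), given that the body starts at beginLsn.
 * Returns the number of bytes copied.
 */
static size_t
cut_body(const char *body, XLogRecPtr beginLsn, XLogRecPtr endLsn,
         XLogRecPtr startStreamingAt, char *out)
{
    size_t      len;

    assert(startStreamingAt >= beginLsn && startStreamingAt < endLsn);
    len = (size_t) (endLsn - startStreamingAt);
    memcpy(out, body + (startStreamingAt - beginLsn), len);
    return len;
}

int
main(void)
{
    const char  body[] = "0123456789";          /* covers LSNs 0x100..0x10A */
    char        out[16];
    size_t      n = cut_body(body, 0x100, 0x10A, 0x104, out);

    /* n == 6: bytes "456789", the part at and above LSN 0x104 */
    return n == 6 ? 0 : 1;
}
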
- */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - } - else - { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_walkeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the walkeeper is - * participating in voting, but hasn't sent anything - * yet. The ones that have sent something are given - * SS_SEND_VOTE or SS_WAIT_VERDICT. - */ - if (walkeeper[j].state == SS_VOTING) - { - walkeeper[j].state = SS_SEND_VOTE; - /* Immediately send info */ - AdvancePollState(j, WL_NO_EVENTS); - } - } - } - break; + wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + * receiving of this + * message */ - /* - * Voting is an idle state - we don't expect any events to - * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see - * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + wk->ackMsg = wk->ackMsg->next; + readAnything = true; + } - /* We have quorum for voting, send our vote request */ - case SS_SEND_VOTE: - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; + if (!readAnything) + return wk->state == SS_ACTIVE; - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ - break; + HandleWalKeeperResponse(); - /* Start reading the walkeeper response for our candidate */ - case SS_WAIT_VERDICT: - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) - return; + /* + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + lastSentCommitLsn = minQuorumLsn; + } - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + return wk->state == SS_ACTIVE; +} - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); - } - Assert(wk->voteResponse.term == propTerm); +/* + * Combine hot standby feedbacks from all walkeepers. 
+ */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ - /* Handshake completed, do we have quorum? */ - n_votes++; - if (n_votes < quorum) - { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.hs.ts != 0) + { + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + { + hs->xmin = walkeeper[i].feedback.hs.xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + } + } +} - /* recovery already performed, just start streaming */ - SendProposerElected(wk); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ +/* + * Get minimum of disk consistent LSNs of all safekeepers + */ +static XLogRecPtr +CalculateDiskConsistentLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.diskConsistentLsn < lsn) + { + lsn = walkeeper[i].feedback.diskConsistentLsn; + } + } + return lsn; +} - DetermineEpochStartLsn(); +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.flushLsn < lsn) + lsn = walkeeper[i].feedback.flushLsn; + } + return lsn; +} - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_WALKEEPERS]; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); - } + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_walkeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? + walkeeper[i].feedback.flushLsn : 0; + } + qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. 
- */ + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_walkeepers - quorum]; +} - if (syncSafekeepers) - { - /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); +static void +HandleWalKeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; - /* keep polling until all walkeepers are synced */ - return; - } + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = CalculateDiskConsistentLsn(); - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ - } + if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + { - break; + if (minQuorumLsn > lastFeedback.flushLsn) + lastFeedback.flushLsn = minQuorumLsn; - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: + lastFeedback.diskConsistentLsn = diskConsistentLsn; - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(wk)) - return; - - StartStreaming(wk); + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + lastFeedback.diskConsistentLsn, + GetCurrentTimestamp(), false); + } - break; + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + { + lastFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + truncateLsn = minFlushLsn; - /* - * Idle state for sending WAL. 
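GetAcknowledgedByQuorumWALPosition above relies on the fact that, after sorting the per-safekeeper flush positions, the element at index n_walkeepers - quorum is the highest LSN that at least a quorum has flushed. A tiny standalone example of that selection with made-up values:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

static int
cmp_lsn(const void *a, const void *b)
{
    XLogRecPtr  la = *(const XLogRecPtr *) a;
    XLogRecPtr  lb = *(const XLogRecPtr *) b;

    return (la > lb) - (la < lb);
}

int
main(void)
{
    /* hypothetical flush positions, all already past epochStartLsn */
    XLogRecPtr  flush[5] = {0x500, 0x200, 0x450, 0x300, 0x100};
    int         n = 5, quorum = 3;            /* quorum = n/2 + 1 */

    qsort(flush, n, sizeof(XLogRecPtr), cmp_lsn);

    /*
     * After sorting: 0x100 0x200 0x300 0x450 0x500.  flush[n - quorum] is
     * 0x300: at least "quorum" safekeepers (those at 0x300, 0x450, 0x500)
     * have flushed up to that point, so it can be reported as committed.
     */
    printf("commit lsn = %lX\n", (unsigned long) flush[n - quorum]);
    return 0;
}
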
Moved out only by calls to - * SendMessageToNode - */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + { + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); + free(msg); + } + if (!msgQueueHead) /* queue is empty */ + msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ + if (msgQueueHead) + { + /* Max takes care of special 0-sized messages */ + Assert(truncateLsn >= msgQueueHead->req.beginLsn && + truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); + } - case SS_ACTIVE: - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) - return; + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-walkeeper) we currently + * wait for all seemingly alive walkeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) - return; + n_synced = 0; + for (int i = 0; i < n_walkeepers; i++) + { + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; - UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); - break; + /* alive safekeeper which is not synced yet; wait for it */ + if (wk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All walkeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); } - - /* - * We've already done something for these events - don't attempt more - * states than we need to. 
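The cleanup loop above may only drop a queue entry once every safekeeper has acknowledged it and it lies below truncateLsn; otherwise a reconnecting safekeeper could still need the data. A simplified sketch of that rule (no tail pointer and no poisoning of freed memory, unlike the patch):

#include <stdlib.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

typedef struct QMsg
{
    struct QMsg *next;
    uint32_t    ackMask;
    XLogRecPtr  endLsn;
} QMsg;

/*
 * Drop queue entries that every safekeeper has acknowledged and that lie
 * entirely below truncateLsn; return the new head.
 */
static QMsg *
trim_queue(QMsg *head, int n_walkeepers, XLogRecPtr truncateLsn)
{
    uint32_t    all = (1u << n_walkeepers) - 1;

    while (head != NULL && head->ackMask == all && head->endLsn <= truncateLsn)
    {
        QMsg       *victim = head;

        head = head->next;
        free(victim);
    }
    return head;
}

int
main(void)
{
    QMsg       *b = malloc(sizeof(QMsg));
    QMsg       *a = malloc(sizeof(QMsg));

    a->next = b;    a->ackMask = 0x7; a->endLsn = 0x100;   /* acked by all 3 */
    b->next = NULL; b->ackMask = 0x3; b->endLsn = 0x200;   /* still missing one ack */

    /* only the first entry is dropped: the head becomes b */
    return trim_queue(a, 3, 0x150) == b ? 0 : 1;
}
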
- */ - events = WL_NO_EVENTS; } } @@ -2078,29 +2135,3 @@ AsyncFlush(WalKeeper *wk) return false; } } - -/* - * WalProposerRegister - * Register a background worker porposing WAL to wal acceptors - */ -void -WalProposerRegister(void) -{ - BackgroundWorker bgw; - - if (*wal_acceptors_list == '\0') - return; - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} From 795bb1452d964dd1be90fb1fcb6de8a2d7215c6c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 4 Jan 2022 13:29:06 +0300 Subject: [PATCH 086/214] Simplify walproposer code (#114) * Clean up walproposer states * Migrate AsyncReadFixed to AsyncReadMessage * Handle flushWrite better a bit * Update SS_ACTIVE event set in single place Now event set is updated only in the end of HandleActiveState, after all handlers code was executed. * Add comment on SS_ACTIVE write event * Add TODO for SS_ACTIVE DesiredEvents --- src/backend/replication/walproposer.c | 1025 ++++++++++--------- src/backend/replication/walproposer_utils.c | 46 +- src/include/replication/walproposer.h | 51 +- 3 files changed, 554 insertions(+), 568 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index b307c79177d..14f300d110b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -110,7 +110,15 @@ static void ShutdownConnection(WalKeeper *wk); static void ResetConnection(WalKeeper *wk); static long TimeToReconnect(TimestampTz now); static void ReconnectWalKeepers(void); -static void AdvancePollState(int i, uint32 events); +static void AdvancePollState(WalKeeper *wk, uint32 events); +static void HandleConnectionEvent(WalKeeper *wk); +static void SendStartWALPush(WalKeeper *wk); +static void RecvStartWALPushResult(WalKeeper *wk); +static void SendProposerGreeting(WalKeeper *wk); +static void RecvAcceptorGreeting(WalKeeper *wk); +static void SendVoteRequest(WalKeeper *wk); +static void RecvVoteResponse(WalKeeper *wk); +static void HandleElectedProposer(void); static term_t GetHighestTerm(TermHistory *th); static term_t GetEpoch(WalKeeper *wk); static void DetermineEpochStartLsn(void); @@ -122,6 +130,7 @@ static void SendMessageToNode(int i, WalMessage *msg); static void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void HandleActiveState(WalKeeper *wk, uint32 events); static bool SendAppendRequests(WalKeeper *wk); static bool RecvAppendResponses(WalKeeper *wk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); @@ -129,10 +138,9 @@ static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleWalKeeperResponse(void); -static bool AsyncRead(int i, char **buf, int *buf_size); -static bool AsyncReadFixed(int i, void *value, size_t value_size); -static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState 
success_state); +static bool AsyncRead(WalKeeper *wk, char **buf, int *buf_size); +static bool AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); static bool AsyncFlush(WalKeeper *wk); @@ -250,21 +258,19 @@ WalProposerPoll(void) { WalKeeper *wk; int rc; - int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); wk = (WalKeeper *) event.user_data; - i = (int) (wk - walkeeper); /* * If the event contains something that one of our walkeeper states * was waiting for, we'll advance its state. */ if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(i, event.events); + AdvancePollState(wk, event.events); /* * If the timeout expired, attempt to reconnect to any walkeepers that @@ -439,7 +445,7 @@ InitEventSet(void) * * This function is called any time the WAL keeper's state switches to one where * it has to wait to continue. This includes the full body of AdvancePollState - * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + * and calls to IO helper functions. */ static void UpdateEventSet(WalKeeper *wk, uint32 events) @@ -640,453 +646,446 @@ ReconnectWalKeepers(void) } /* - * Performs the logic for advancing the state machine of the 'i'th walkeeper, + * Performs the logic for advancing the state machine of the specified walkeeper, * given that a certain set of events has occured. */ static void -AdvancePollState(int i, uint32 events) +AdvancePollState(WalKeeper *wk, uint32 events) { - WalKeeper *wk = &walkeeper[i]; /* - * Keep advancing the state while either: (a) the event is still - * unprocessed (usually because it's the first iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. */ - while (events || StateShouldImmediatelyExecute(wk->state)) + AssertEventsOkForState(events, wk); + + /* Execute the code corresponding to the current state */ + switch (wk->state) { - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. - */ - AssertEventsOkForState(events, wk); + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ - /* Execute the code corresponding to the current state */ - switch (wk->state) - { - /* - * WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + HandleConnectionEvent(wk); + break; - /* - * Both connecting states run the same logic. 
The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); - - /* - * Once we're fully connected, we can move to the - * next state - */ - wk->state = SS_EXEC_STARTWALPUSH; - - /* - * Even though SS_EXEC_STARTWALPUSH doesn't wait - * on anything, we do need to replace the current - * event, so we have to just pick something. We'll - * eventually need the socket to be readable, so - * we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; - - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; - - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectWalKeepers. - */ - ShutdownConnection(wk); - return; - } - - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. - */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + /* + * Waiting for a successful CopyBoth response. + */ + case SS_WAIT_EXEC_RESULT: + RecvStartWALPushResult(wk); + break; - /* - * Send "START_WAL_PUSH" command to the walkeeper. After - * sending, wait for response with SS_WAIT_EXEC_RESULT - */ - case SS_EXEC_STARTWALPUSH: - { - char *query = NULL; - if (zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(wk->conn, query)) - { - pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - } - pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); - break; - } - - case SS_WAIT_EXEC_RESULT: - switch (walprop_get_query_result(wk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: - - /* - * Because this state is immediately executable, we'll - * start this on the next iteration of the loop - */ - wk->state = SS_HANDSHAKE_SEND; - break; - - /* - * Needs repeated calls to finish. 
Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: - - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; - - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); - return; - } - break; + /* + * Finish handshake comms: receive information about the safekeeper. + */ + case SS_HANDSHAKE_RECV: + RecvAcceptorGreeting(wk); + break; - /* - * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ - case SS_HANDSHAKE_SEND: + /* + * Voting is an idle state - we don't expect any events to trigger. + * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are + * transferred from SS_VOTING to sending actual vote requests. + */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return; - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. - */ - if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) - return; + /* Read the safekeeper response for our candidate */ + case SS_WAIT_VERDICT: + RecvVoteResponse(wk); + break; - break; + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: - /* - * Finish handshake comms: receive information about the WAL - * keeper - */ - case SS_HANDSHAKE_RECV: + /* + * AsyncFlush ensures we only move on to SS_ACTIVE once the flush + * completes. If we still have more to do, we'll wait until the next + * poll comes along. + */ + if (!AsyncFlush(wk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(wk); + break; - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) - return; + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return; - /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; + /* + * Active state is used for streaming WAL and receiving feedback. + */ + case SS_ACTIVE: + HandleActiveState(wk, events); + break; + } +} - /* - * Don't need to update the event set yet. Either we update - * the event set to WL_SOCKET_READABLE *or* we change the - * state to SS_SEND_VOTE in the loop below - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; +static void +HandleConnectionEvent(WalKeeper *wk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. 
- */ - propTerm = Max(walkeeper[i].greet.term, propTerm); + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; - /* - * Check if we have quorum. If there aren't enough walkeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election - */ - if (++n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - } - else - { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } - - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_walkeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the walkeeper is - * participating in voting, but hasn't sent anything - * yet. The ones that have sent something are given - * SS_SEND_VOTE or SS_WAIT_VERDICT. - */ - if (walkeeper[j].state == SS_VOTING) - { - walkeeper[j].state = SS_SEND_VOTE; - /* Immediately send info */ - AdvancePollState(j, WL_NO_EVENTS); - } - } - } - break; + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); - /* - * Voting is an idle state - we don't expect any events to - * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see - * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* + * We have to pick some event to update event set. + * We'll eventually need the socket to be readable, + * so we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; - /* We have quorum for voting, send our vote request */ - case SS_SEND_VOTE: - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ - break; + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. + */ + ShutdownConnection(wk); + return; + } - /* Start reading the walkeeper response for our candidate */ - case SS_WAIT_VERDICT: - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) - return; + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. 
+ */ + HackyRemoveWalProposerEvent(wk); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(wk); +} - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); - } - Assert(wk->voteResponse.term == propTerm); - - /* Handshake completed, do we have quorum? */ - n_votes++; - if (n_votes < quorum) - { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { - - /* recovery already performed, just start streaming */ - SendProposerElected(wk); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ - - DetermineEpochStartLsn(); - - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); - } - - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. - */ - - if (syncSafekeepers) - { - /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - - /* keep polling until all walkeepers are synced */ - return; - } - - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ - } +/* + * Send "START_WAL_PUSH" message as an empty query to the walkeeper. Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. 
+ */ +static void +SendStartWALPush(WalKeeper *wk) +{ + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); +} - break; +static void +RecvStartWALPushResult(WalKeeper *wk) +{ + switch (walprop_get_query_result(wk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: + SendProposerGreeting(wk); + break; - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(wk)) - return; - - StartStreaming(wk); + /* + * Needs repeated calls to finish. Wait until the + * socket is readable + */ + case WP_EXEC_NEEDS_INPUT: - break; + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ + break; + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; - /* - * Idle state for sending WAL. Moved out only by calls to - * SendMessageToNode - */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad response from walkeeper %s:%s query execution", + wk->host, wk->port); + ShutdownConnection(wk); + return; + } +} + +/* + * Start handshake: first of all send information about the + * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ +static void +SendProposerGreeting(WalKeeper *wk) +{ + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. + */ + BlockingWrite(wk, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV); +} +static void +RecvAcceptorGreeting(WalKeeper *wk) +{ + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + wk->greet.apm.tag = 'g'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->greet)) + return; - case SS_ACTIVE: - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) - return; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; + wk->feedback.flushLsn = truncateLsn; + wk->feedback.hs.ts = 0; - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) - return; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(wk->greet.term, propTerm); - UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); - break; + /* + * Check if we have quorum. 
If there aren't enough safekeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election + */ + if (++n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. + */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else + { + if (n_connected == quorum) + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } /* - * We've already done something for these events - don't attempt more - * states than we need to. + * Now send voting request to the cohort and wait + * responses */ - events = WL_NO_EVENTS; + for (int j = 0; j < n_walkeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the safekeeper is + * participating in voting, but hasn't sent anything + * yet. + */ + if (walkeeper[j].state == SS_VOTING) + SendVoteRequest(&walkeeper[j]); + } + } +} + +static void +SendVoteRequest(WalKeeper *wk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(wk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(WalKeeper *wk) +{ + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->voteResponse.term, propTerm); + } + Assert(wk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(wk); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. 
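Once a quorum of greetings has arrived, the proposer picks its term as one more than the highest term it has seen, so it is guaranteed to be unique and higher than anything voted on before. A toy illustration of that rule with sample values (not part of the patch):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t term_t;

int
main(void)
{
    /* hypothetical terms reported by the first "quorum" greeting acceptors */
    term_t      greeted[3] = {4, 2, 7};
    term_t      propTerm = 0;

    /*
     * Keep the running maximum of the acceptors' terms, then add one once a
     * quorum has connected, as RecvAcceptorGreeting does above.
     */
    for (int i = 0; i < 3; i++)
        if (greeted[i] > propTerm)
            propTerm = greeted[i];
    propTerm++;

    printf("proposer term = %llu\n", (unsigned long long) propTerm);   /* 8 */
    return 0;
}
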
+ */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ + + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + /* keep polling until all walkeepers are synced */ + return; } + + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ } /* latest term in TermHistory, or 0 is there is no entries */ @@ -1372,7 +1371,8 @@ WalProposerStartStreaming(XLogRecPtr startpos) } /* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE and sets + * correct event set. */ static void StartStreaming(WalKeeper *wk) @@ -1384,7 +1384,6 @@ StartStreaming(WalKeeper *wk) * exactly once for a connection. */ wk->state = SS_ACTIVE; - UpdateEventSet(wk, WL_SOCKET_READABLE); for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { @@ -1395,17 +1394,21 @@ StartStreaming(WalKeeper *wk) } else { + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(wki, msg); return; } } + + /* Call SS_ACTIVE handler to update event set */ + HandleActiveState(wk, WL_NO_EVENTS); } /* - * Start sending message to the particular node. + * Start sending message to the particular node. Always updates event set. * - * Always updates the state and event set for the WAL keeper; setting either of - * these before calling would be redundant work. + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. */ static void SendMessageToNode(int i, WalMessage *msg) @@ -1414,6 +1417,7 @@ SendMessageToNode(int i, WalMessage *msg) /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); + Assert(wk->state == SS_ACTIVE); /* * Skip already acknowledged messages. 
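
/*
 * Sketch of the recovery decision made in HandleElectedProposer above:
 * if the globally applied position (truncateLsn) is behind the start of
 * the new epoch, the proposer first fetches the missing WAL from the most
 * advanced safekeeper. This is a stand-alone illustration; XLogRecPtr and
 * the %X/%X LSN formatting are reproduced here with plain C types.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

static bool
needs_recovery(XLogRecPtr truncateLsn, XLogRecPtr epochStartLsn)
{
	return truncateLsn < epochStartLsn;
}

int
main(void)
{
	XLogRecPtr	truncateLsn = 0x16B9188;
	XLogRecPtr	epochStart = 0x16B91D0;

	if (needs_recovery(truncateLsn, epochStart))
		printf("start recovery: truncateLsn=%X/%X < epochStartLsn=%X/%X\n",
			   (unsigned int) (truncateLsn >> 32), (unsigned int) truncateLsn,
			   (unsigned int) (epochStart >> 32), (unsigned int) epochStart);
	return 0;
}
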
Used after reconnection to get to @@ -1423,11 +1427,9 @@ SendMessageToNode(int i, WalMessage *msg) msg = msg->next; wk->currMsg = msg; - wk->flushWrite = false; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - if (!SendAppendRequests(wk)) - return; + HandleActiveState(wk, WL_SOCKET_WRITEABLE); } /* @@ -1527,9 +1529,40 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) return msg; } +/* + * Process all events happened in SS_ACTIVE state, update event set after that. + */ +static void +HandleActiveState(WalKeeper *wk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * wk->currMsg checks if we have pending unsent messages. This check isn't + * necessary now, because we always send queue messages immediately after + * creation. But it's good to have it here in case we change this behavior + * in the future. + */ + if (wk->currMsg != NULL || wk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(wk, newEvents); +} + /* * Send queue messages starting from wk->currMsg until the end or non-writable - * socket, whichever comes first. + * socket, whichever comes first. Caller should take care of updating event set. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. @@ -1540,6 +1573,7 @@ SendAppendRequests(WalKeeper *wk) int wki = wk - walkeeper; WalMessage *msg; AppendRequestHeader *req; + PGAsyncWriteResult writeResult; if (wk->flushWrite) { @@ -1550,7 +1584,7 @@ SendAppendRequests(WalKeeper *wk) */ return wk->state == SS_ACTIVE; - wk->currMsg = wk->currMsg->next; + /* Event set will be updated in the end of HandleActiveState */ wk->flushWrite = false; } @@ -1604,24 +1638,39 @@ SendAppendRequests(WalKeeper *wk) * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_ACTIVE)) - { - if (req != &msg->req) - free(req); - if (wk->state == SS_ACTIVE) - { - wk->flushWrite = true; - return true; - } - return false; - } + writeResult = walprop_async_write(wk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); + + /* Free up resources */ if (req != &msg->req) free(req); - /* continue writing the next message */ + /* Mark current message as sent, whatever the result is */ wk->currMsg = wk->currMsg->next; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. + */ + wk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return false; + default: + Assert(false); + return false; + } } return true; @@ -1649,7 +1698,8 @@ RecvAppendResponses(WalKeeper *wk) * necessary error handling or state setting is taken care * of. We can leave any other work until later. 
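
/*
 * Sketch of the event-interest rule implemented by HandleActiveState
 * above: an active connection always watches for read-readiness (an ack
 * may arrive at any time) and additionally for write-readiness while
 * there is either an unsent message or unflushed libpq output. The flag
 * names mirror the patch; the bit values below are illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>

#define WL_SOCKET_READABLE   (1 << 0)
#define WL_SOCKET_WRITEABLE  (1 << 1)

static uint32_t
active_state_events(bool has_unsent_msg, bool has_unflushed_output)
{
	uint32_t	events = WL_SOCKET_READABLE;

	if (has_unsent_msg || has_unflushed_output)
		events |= WL_SOCKET_WRITEABLE;

	return events;
}
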
*/ - if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + wk->feedback.apm.tag = 'a'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->feedback)) break; Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); @@ -1657,18 +1707,9 @@ RecvAppendResponses(WalKeeper *wk) /* * We shouldn't read responses ahead of wk->currMsg, because that will * look like we are receiving responses for messages that haven't been - * sent yet. This can happen when message was placed in a buffer in - * SendAppendRequests, but sent through a wire only with a flush inside - * AsyncReadFixed. In this case, we should move wk->currMsg. + * sent yet. */ - if (wk->ackMsg == wk->currMsg) - { - /* Couldn't happen without flush flag */ - Assert(wk->flushWrite); - - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; - } + Assert(wk->ackMsg != wk->currMsg); wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms * receiving of this @@ -1911,10 +1952,8 @@ HandleWalKeeperResponse(void) * failure. */ static bool -AsyncRead(int i, char **buf, int *buf_size) +AsyncRead(WalKeeper *wk, char **buf, int *buf_size) { - WalKeeper *wk = &walkeeper[i]; - switch (walprop_async_read(wk->conn, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: @@ -1936,56 +1975,23 @@ AsyncRead(int i, char **buf, int *buf_size) } /* - * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, - * returning whether the read was successful. - * + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. */ static bool -AsyncReadFixed(int i, void *value, size_t value_size) +AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) { - WalKeeper *wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - - if (!(AsyncRead(i, &buf, &buf_size))) - return false; - - /* - * If we get here, the read was ok, but we still need to check it was the - * right amount - */ - if ((size_t) buf_size != value_size) - { - elog(FATAL, - "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - value_size, buf_size); - } - - /* Copy the resulting info into place */ - memcpy(value, buf, buf_size); - - return true; -} - -/* - * Read next message with known type into provided struct. 
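
/*
 * Sketch of the acknowledgement bookkeeping visible above: each queued
 * WAL message carries a bitmask with one bit per safekeeper, and a
 * message is fully acknowledged once the number of set bits reaches the
 * quorum. Stand-alone illustration; the real code keeps the mask in
 * WalMessage->ackMask.
 */
#include <stdbool.h>
#include <stdint.h>

static int
count_acks(uint32_t ackMask, int n_safekeepers)
{
	int			acks = 0;

	for (int i = 0; i < n_safekeepers; i++)
		if (ackMask & (1u << i))
			acks++;
	return acks;
}

static bool
acked_by_quorum(uint32_t ackMask, int n_safekeepers)
{
	int			quorum = n_safekeepers / 2 + 1;

	return count_acks(ackMask, n_safekeepers) >= quorum;
}
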
- * TODO: migrate AsyncReadFixed here for all messages - */ -static bool -AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) -{ - WalKeeper *wk = &walkeeper[i]; char *buf; int buf_size; uint64 tag; StringInfoData s; - if (!(AsyncRead(i, &buf, &buf_size))) + if (!(AsyncRead(wk, &buf, &buf_size))) return false; /* parse it */ @@ -2004,6 +2010,14 @@ AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) switch (tag) { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + case 'v': { VoteResponse *msg = (VoteResponse *) anymsg; @@ -2023,6 +2037,20 @@ AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) return true; } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->diskConsistentLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + default: { Assert(false); @@ -2038,9 +2066,8 @@ AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) * single packet. */ static bool -BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state) { - WalKeeper *wk = &walkeeper[i]; uint32 events; if (!walprop_blocking_write(wk->conn, msg, msg_size)) @@ -2105,7 +2132,9 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * Flushes a previous call to AsyncWrite. This only needs to be called when the * socket becomes read or write ready *after* calling AsyncWrite. * - * If flushing successfully completes returns true, otherwise false. + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. 
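
/*
 * Sketch of the wire format handled by AsyncReadMessage above: every
 * acceptor-to-proposer message starts with a little-endian 64-bit tag
 * ('g', 'v' or 'a') followed by fixed little-endian 64-bit fields. The
 * parser below is a stand-alone illustration of a shortened 'a'
 * (AppendResponse) layout; the struct and helper are local stand-ins,
 * not the pq_getmsgint64_le API itself.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef struct
{
	uint64_t	term;
	uint64_t	flushLsn;
	uint64_t	commitLsn;
} MiniAppendResponse;

static uint64_t
get_u64_le(const uint8_t **p)
{
	uint64_t	v;

	memcpy(&v, *p, sizeof(v));	/* assumes a little-endian host, as the patch does */
	*p += sizeof(v);
	return v;
}

static bool
parse_append_response(const uint8_t *buf, size_t len, MiniAppendResponse *out)
{
	const uint8_t *p = buf;

	if (len < 4 * sizeof(uint64_t) || get_u64_le(&p) != 'a')
		return false;
	out->term = get_u64_le(&p);
	out->flushLsn = get_u64_le(&p);
	out->commitLsn = get_u64_le(&p);
	return true;
}
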
*/ static bool AsyncFlush(WalKeeper *wk) @@ -2119,7 +2148,7 @@ AsyncFlush(WalKeeper *wk) switch (walprop_flush(wk->conn)) { case 0: - UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ + /* flush is done */ return true; case 1: /* Nothing to do; try again when the socket's ready */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index c61ab87db45..74ea1cfd5b1 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -48,24 +48,15 @@ FormatWalKeeperState(WalKeeperState state) case SS_CONNECTING_WRITE: return_val = "connecting"; break; - case SS_EXEC_STARTWALPUSH: - return_val = "sending 'START_WAL_PUSH' query"; - break; case SS_WAIT_EXEC_RESULT: return_val = "receiving query result"; break; - case SS_HANDSHAKE_SEND: - return_val = "handshake (sending)"; - break; case SS_HANDSHAKE_RECV: return_val = "handshake (receiving)"; break; case SS_VOTING: return_val = "voting"; break; - case SS_SEND_VOTE: - return_val = "sending vote"; - break; case SS_WAIT_VERDICT: return_val = "wait-for-verdict"; break; @@ -140,19 +131,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_SOCKET_READABLE; break; - /* Most writing states don't require any socket conditions */ - case SS_EXEC_STARTWALPUSH: - case SS_HANDSHAKE_SEND: - case SS_SEND_VOTE: - result = WL_NO_EVENTS; - break; - /* but flushing does require read- or write-ready */ - case SS_SEND_ELECTED_FLUSH: - /* Active state does both reading and writing to the socket */ - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - /* Idle states use read-readiness as a sign that the connection has been * disconnected. */ case SS_VOTING: @@ -160,6 +138,18 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_SOCKET_READABLE; break; + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check wk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + /* The offline state expects no events. */ case SS_OFFLINE: result = WL_NO_EVENTS; @@ -169,16 +159,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) return result; } -/* Returns whether the WAL keeper state corresponds to something that should be - * immediately executed -- i.e. it is not idle, and is not currently waiting. */ -bool -StateShouldImmediatelyExecute(WalKeeperState state) -{ - /* This is actually pretty simple to determine. */ - return WalKeeperStateDesiredEvents(state) == WL_NO_EVENTS - && state != SS_OFFLINE; -} - /* Returns a human-readable string corresponding to the event set * * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the @@ -309,4 +289,4 @@ pq_sendint64_le(StringInfo buf, uint64 i) enlargeStringInfo(buf, sizeof(uint64)); memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); -} \ No newline at end of file +} diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index ca27df2d19b..9506a6ee887 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -68,20 +68,12 @@ typedef enum } PGAsyncWriteResult; /* - * WAL safekeeper state + * WAL safekeeper state, which is used to wait for some event. 
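
/*
 * Sketch of the state-to-wait-event mapping that WalKeeperStateDesiredEvents
 * implements after this patch: receiving and idle states watch only for
 * read-readiness, flushing and active states watch both directions, and
 * SS_OFFLINE waits for nothing. The enum below is a trimmed stand-in for
 * the real one, kept only to show the shape of the switch.
 */
#include <stdint.h>

#define WL_NO_EVENTS         0
#define WL_SOCKET_READABLE   (1 << 0)
#define WL_SOCKET_WRITEABLE  (1 << 1)

typedef enum
{
	MINI_SS_OFFLINE,
	MINI_SS_WAIT_EXEC_RESULT,	/* read-only */
	MINI_SS_VOTING,				/* idle: read-ready means "connection closed" */
	MINI_SS_SEND_ELECTED_FLUSH,	/* needs write-ready to finish flushing */
	MINI_SS_ACTIVE				/* both directions */
} MiniState;

static uint32_t
desired_events(MiniState state)
{
	switch (state)
	{
		case MINI_SS_WAIT_EXEC_RESULT:
		case MINI_SS_VOTING:
			return WL_SOCKET_READABLE;
		case MINI_SS_SEND_ELECTED_FLUSH:
		case MINI_SS_ACTIVE:
			return WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
		case MINI_SS_OFFLINE:
		default:
			return WL_NO_EVENTS;
	}
}
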
* * States are listed here in the order that they're executed. * * Most states, upon failure, will move back to SS_OFFLINE by calls to * ResetConnection or ShutdownConnection. - * - * Also note: In places we say that a state "immediately" moves to another. This - * happens in states that only exist to execute program logic, so they run - * exactly once (when moved into), without waiting for any socket conditions. - * - * For example, when we set a WalKeeper's state to SS_SEND_VOTE, we immediately - * call AdvancePollState - during which the WalKeeper switches its state to - * SS_WAIT_VERDICT. */ typedef enum { @@ -99,28 +91,18 @@ typedef enum * they execute when polled, but we have this distinction in order to * recreate the event set in HackyRemoveWalProposerEvent. * - * After the connection is made, moves to SS_EXEC_STARTWALPUSH. + * After the connection is made, "START_WAL_PUSH" query is sent. */ SS_CONNECTING_WRITE, SS_CONNECTING_READ, - /* - * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. - * Performs a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. - */ - SS_EXEC_STARTWALPUSH, /* * Waiting for the result of the "START_WAL_PUSH" command. * - * After we get a successful result, moves to SS_HANDSHAKE_SEND. + * After we get a successful result, sends handshake to safekeeper. */ SS_WAIT_EXEC_RESULT, - /* - * Executing the sending half of the handshake. Performs the blocking send, - * then immediately moves to SS_HANDSHAKE_RECV. - */ - SS_HANDSHAKE_SEND, /* * Executing the receiving half of the handshake. After receiving, moves to * SS_VOTING. @@ -128,32 +110,28 @@ typedef enum SS_HANDSHAKE_RECV, /* - * Currently participating in voting, but a quorum hasn't yet been reached. + * Waiting to participate in voting, but a quorum hasn't yet been reached. * This is an idle state - we do not expect AdvancePollState to be called. * - * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of - * SS_HANDSHAKE_RECV. + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. */ SS_VOTING, - /* - * Performs a blocking send of the assigned vote, then immediately moves to - * SS_WAIT_VERDICT. - */ - SS_SEND_VOTE, + /* * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE. + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. */ SS_WAIT_VERDICT, - /* need to flush ProposerAnnouncement */ + /* Need to flush ProposerElected message. */ SS_SEND_ELECTED_FLUSH, /* * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. * - * Moves to SS_ACTIVE only by calls to SendMessageToNode. + * Moves to SS_ACTIVE only by call to StartStreaming. */ SS_IDLE, @@ -195,7 +173,7 @@ typedef struct AcceptorProposerMessage */ typedef struct AcceptorGreeting { - uint64 tag; + AcceptorProposerMessage apm; term_t term; } AcceptorGreeting; @@ -306,11 +284,11 @@ typedef struct HotStandbyFeedback */ typedef struct AppendResponse { + AcceptorProposerMessage apm; /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. 
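
/*
 * Sketch of the message-layout change above: each acceptor-to-proposer
 * struct now embeds AcceptorProposerMessage as its first member, so a
 * pointer to the concrete struct can be handed to the generic reader and
 * the tag lives in one place. The types below are local stand-ins that
 * show the idiom; they are not the patched definitions.
 */
#include <stdint.h>

typedef struct
{
	uint64_t	tag;
} MiniBaseMessage;

typedef struct
{
	MiniBaseMessage apm;		/* must be the first member */
	uint64_t	term;
} MiniGreeting;

static void
read_message(MiniBaseMessage *anymsg)
{
	switch (anymsg->tag)
	{
		case 'g':
			((MiniGreeting *) anymsg)->term = 1;	/* fill the concrete fields */
			break;
		default:
			break;
	}
}

static uint64_t
example(void)
{
	MiniGreeting greet;

	greet.apm.tag = 'g';		/* caller sets the expected tag, as in the patch */
	read_message(&greet.apm);
	return greet.term;
}
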
*/ - uint64 tag; term_t term; // TODO: add comment XLogRecPtr flushLsn; @@ -341,8 +319,8 @@ typedef struct WalKeeper WalProposerConn* conn; StringInfoData outbuf; - bool flushWrite; /* set to true if we wrote currMsg, but still need to call AsyncFlush */ - WalMessage* currMsg; /* message been send to the receiver */ + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ @@ -361,7 +339,6 @@ int CompareLsn(const void *a, const void *b); char* FormatWalKeeperState(WalKeeperState state); void AssertEventsOkForState(uint32 events, WalKeeper* wk); uint32 WalKeeperStateDesiredEvents(WalKeeperState state); -bool StateShouldImmediatelyExecute(WalKeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); From fe53b717bf372db06b95087027e697eaaf54a574 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 4 Jan 2022 13:47:33 +0300 Subject: [PATCH 087/214] Don't change propTerm after quorum is acquired (#107) --- src/backend/replication/walproposer.c | 61 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 14f300d110b..d6ff8ef4afa 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -79,16 +79,23 @@ static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to * walkeepers */ static ProposerGreeting proposerGreeting; +static VoteRequest voteRequest; /* Vote request for walkeeper */ static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* - * minimal LSN which may be needed for recovery of some safekeeper, + * Minimal LSN which may be needed for recovery of some safekeeper, * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; -static VoteRequest voteRequest; /* Vote request for walkeeper */ +/* + * Term of the proposer. We want our term to be highest and unique, + * so we collect terms from safekeepers quorum, choose max and +1. + * After that our term is fixed and must not change. If we observe + * that some safekeeper has higher term, it means that we have another + * running compute, so we must stop immediately. + */ +static term_t propTerm; static TermHistory propTermHistory; /* term history of the proposer */ -static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ @@ -910,20 +917,42 @@ RecvAcceptorGreeting(WalKeeper *wk) wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. - */ - propTerm = Max(wk->greet.term, propTerm); + ++n_connected; + if (n_connected <= quorum) + { + /* We're still collecting terms from the majority. */ + propTerm = Max(wk->greet.term, propTerm); + + /* Quorum is acquried, prepare the vote request. 
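
/*
 * Sketch of the invariant this patch introduces above: greeting terms are
 * folded into propTerm only until the quorum-th greeting arrives; after
 * that propTerm is frozen, and a later greeting carrying a higher term
 * means another compute is running, so the proposer must stop. This is a
 * stand-alone illustration with local stand-ins, not the patched code.
 */
#include <stdint.h>

typedef uint64_t term_t;

typedef enum
{
	GREET_OK,					/* keep waiting */
	GREET_ELECT_NOW,			/* quorum reached, start the election */
	GREET_FATAL					/* concurrent compute with a higher term */
} GreetAction;

static GreetAction
on_greeting(term_t greet_term, term_t *propTerm, int *n_connected, int quorum)
{
	++*n_connected;
	if (*n_connected <= quorum)
	{
		/* still collecting terms from the majority */
		if (greet_term > *propTerm)
			*propTerm = greet_term;
		if (*n_connected == quorum)
		{
			++*propTerm;		/* the term is fixed from here on */
			return GREET_ELECT_NOW;
		}
		return GREET_OK;
	}
	/* quorum already acquired: a higher term means a concurrent compute */
	return greet_term > *propTerm ? GREET_FATAL : GREET_OK;
}
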
*/ + if (n_connected == quorum) + { + propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } + } + else if (wk->greet.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->greet.term, propTerm); + } /* * Check if we have quorum. If there aren't enough safekeepers, * wait and do nothing. We'll eventually get a task when the * election starts. * - * If we do have quorum, we can start an election + * If we do have quorum, we can start an election. */ - if (++n_connected < quorum) + if (n_connected < quorum) { /* * SS_VOTING is an idle state; read-ready indicates the @@ -933,18 +962,6 @@ RecvAcceptorGreeting(WalKeeper *wk) } else { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } - /* * Now send voting request to the cohort and wait * responses From 24c53eefedeb5bd2202b20976e021c401cd9b0af Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 5 Jan 2022 13:36:00 +0300 Subject: [PATCH 088/214] walproposer renames (#116) * Rename walkeeper to safekeeper * Rename message variables as request/response --- .../libpqwalproposer/libpqwalproposer.c | 4 +- src/backend/replication/walproposer.c | 758 +++++++++--------- src/backend/replication/walproposer_utils.c | 24 +- src/include/replication/walproposer.h | 46 +- src/tools/pgindent/typedefs.list | 2 +- 5 files changed, 417 insertions(+), 417 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 177c93eb85d..f6714c08128 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -267,7 +267,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * -2 if an error occured * (> 0) if it was successful; that value is the amount transferred. * - * The protocol we use between walproposer and walkeeper means that we + * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). @@ -280,7 +280,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { /* * If we get -1, it's probably because of a server error; the - * walkeeper won't normally send a CopyDone message. + * safekeeper won't normally send a CopyDone message. 
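
/*
 * Sketch of the return-code mapping the comment above describes for the
 * nonblocking CopyData read, assuming libpq's PQgetCopyData in async
 * mode: a positive value is a complete message, 0 means "try again
 * later", -1 means the copy ended (for this protocol usually because the
 * safekeeper sent an ErrorResponse), and -2 is a hard error. The result
 * enum is a local stand-in for the walproposer's own one.
 */
#include <libpq-fe.h>

typedef enum
{
	MINI_READ_SUCCESS,
	MINI_READ_TRY_AGAIN,
	MINI_READ_FAIL
} MiniReadResult;

static MiniReadResult
mini_async_read(PGconn *conn, char **buf, int *amount)
{
	switch ((*amount = PQgetCopyData(conn, buf, 1 /* async */ )))
	{
		case 0:
			return MINI_READ_TRY_AGAIN; /* no full message buffered yet */
		case -1:
			return MINI_READ_FAIL;		/* copy finished, likely a server error */
		case -2:
			return MINI_READ_FAIL;		/* read failed outright */
		default:
			return MINI_READ_SUCCESS;	/* *amount bytes are available in *buf */
	}
}
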
* * We can check PQgetResult to make sure that the server failed; * it'll always result in PGRES_FATAL_ERROR diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d6ff8ef4afa..6f89c23eb2f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -69,19 +69,19 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" -static int n_walkeepers = 0; +static int n_safekeepers = 0; static int quorum = 0; -static WalKeeper walkeeper[MAX_WALKEEPERS]; +static Safekeeper safekeeper[MAX_SAFEKEEPERS]; static WalMessage *msgQueueHead; static WalMessage *msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to * this point */ static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to - * walkeepers */ -static ProposerGreeting proposerGreeting; -static VoteRequest voteRequest; /* Vote request for walkeeper */ + * safekeepers */ +static ProposerGreeting greetRequest; +static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; -static AppendResponse lastFeedback; +static AppendResponse quorumFeedback; /* * Minimal LSN which may be needed for recovery of some safekeeper, * record-aligned (first record which might not yet received by someone). @@ -111,45 +111,45 @@ static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); static void WalProposerStart(void); static void WalProposerLoop(void); static void InitEventSet(void); -static void UpdateEventSet(WalKeeper *wk, uint32 events); -static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); -static void ShutdownConnection(WalKeeper *wk); -static void ResetConnection(WalKeeper *wk); +static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); +static void ShutdownConnection(Safekeeper *sk); +static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(TimestampTz now); -static void ReconnectWalKeepers(void); -static void AdvancePollState(WalKeeper *wk, uint32 events); -static void HandleConnectionEvent(WalKeeper *wk); -static void SendStartWALPush(WalKeeper *wk); -static void RecvStartWALPushResult(WalKeeper *wk); -static void SendProposerGreeting(WalKeeper *wk); -static void RecvAcceptorGreeting(WalKeeper *wk); -static void SendVoteRequest(WalKeeper *wk); -static void RecvVoteResponse(WalKeeper *wk); +static void ReconnectSafekeepers(void); +static void AdvancePollState(Safekeeper *sk, uint32 events); +static void HandleConnectionEvent(Safekeeper *sk); +static void SendStartWALPush(Safekeeper *sk); +static void RecvStartWALPushResult(Safekeeper *sk); +static void SendProposerGreeting(Safekeeper *sk); +static void RecvAcceptorGreeting(Safekeeper *sk); +static void SendVoteRequest(Safekeeper *sk); +static void RecvVoteResponse(Safekeeper *sk); static void HandleElectedProposer(void); static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(WalKeeper *wk); +static term_t GetEpoch(Safekeeper *sk); static void DetermineEpochStartLsn(void); static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); -static void SendProposerElected(WalKeeper *wk); +static void SendProposerElected(Safekeeper *sk); static void WalProposerStartStreaming(XLogRecPtr startpos); -static void StartStreaming(WalKeeper *wk); +static void StartStreaming(Safekeeper *sk); static void SendMessageToNode(int i, WalMessage *msg); static 
void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static void HandleActiveState(WalKeeper *wk, uint32 events); -static bool SendAppendRequests(WalKeeper *wk); -static bool RecvAppendResponses(WalKeeper *wk); +static void HandleActiveState(Safekeeper *sk, uint32 events); +static bool SendAppendRequests(Safekeeper *sk); +static bool RecvAppendResponses(Safekeeper *sk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); -static void HandleWalKeeperResponse(void); -static bool AsyncRead(WalKeeper *wk, char **buf, int *buf_size); -static bool AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); -static bool AsyncFlush(WalKeeper *wk); +static void HandleSafekeeperResponse(void); +static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); +static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); +static bool AsyncFlush(Safekeeper *sk); /* * WAL proposer bgworker entry point. @@ -263,27 +263,27 @@ WalProposerPoll(void) { while (true) { - WalKeeper *wk; + Safekeeper *sk; int rc; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper *) event.user_data; + sk = (Safekeeper *) event.user_data; /* - * If the event contains something that one of our walkeeper states + * If the event contains something that one of our safekeeper states * was waiting for, we'll advance its state. */ if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(wk, event.events); + AdvancePollState(sk, event.events); /* - * If the timeout expired, attempt to reconnect to any walkeepers that + * If the timeout expired, attempt to reconnect to any safekeepers that * we dropped */ - ReconnectWalKeepers(); + ReconnectSafekeepers(); /* * If wait is terminated by latch set (walsenders' latch is set on @@ -362,52 +362,52 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; - if (n_walkeepers + 1 >= MAX_WALKEEPERS) + if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - elog(FATAL, "Too many walkeepers"); + elog(FATAL, "Too many safekeepers"); } - walkeeper[n_walkeepers].host = host; - walkeeper[n_walkeepers].port = port; - walkeeper[n_walkeepers].state = SS_OFFLINE; - walkeeper[n_walkeepers].conn = NULL; + safekeeper[n_safekeepers].host = host; + safekeeper[n_safekeepers].port = port; + safekeeper[n_safekeepers].state = SS_OFFLINE; + safekeeper[n_safekeepers].conn = NULL; /* * Set conninfo to empty. 
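
/*
 * Sketch of the safekeeper-list parsing done in WalProposerInit above:
 * the list is a comma-separated set of host:port entries, each entry is
 * split in place, and quorum is computed as n/2 + 1. Stand-alone
 * illustration with a fixed-size local array; the real code fills the
 * safekeeper[] array and builds each conninfo later, in ResetConnection.
 */
#include <stdio.h>
#include <string.h>

#define MAX_SK 32

int
main(void)
{
	char		list[] = "sk1:5454,sk2:5454,sk3:5454";
	char	   *host[MAX_SK];
	char	   *port[MAX_SK];
	int			n = 0;

	for (char *p = strtok(list, ","); p != NULL && n < MAX_SK; p = strtok(NULL, ","))
	{
		char	   *colon = strchr(p, ':');

		if (colon == NULL)
			return 1;			/* malformed entry */
		*colon = '\0';
		host[n] = p;
		port[n] = colon + 1;
		n++;
	}

	printf("n_safekeepers=%d quorum=%d first=%s:%s\n",
		   n, n / 2 + 1, host[0], port[0]);
	return 0;
}
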
We'll fill it out once later, in * `ResetConnection` as needed */ - walkeeper[n_walkeepers].conninfo[0] = '\0'; - initStringInfo(&walkeeper[n_walkeepers].outbuf); - walkeeper[n_walkeepers].flushWrite = false; - walkeeper[n_walkeepers].currMsg = NULL; - walkeeper[n_walkeepers].ackMsg = NULL; - walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; - n_walkeepers += 1; + safekeeper[n_safekeepers].conninfo[0] = '\0'; + initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].flushWrite = false; + safekeeper[n_safekeepers].currMsg = NULL; + safekeeper[n_safekeepers].ackMsg = NULL; + safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + n_safekeepers += 1; } - if (n_walkeepers < 1) + if (n_safekeepers < 1) { - elog(FATAL, "WalKeepers addresses are not specified"); + elog(FATAL, "Safekeepers addresses are not specified"); } - quorum = n_walkeepers / 2 + 1; + quorum = n_safekeepers / 2 + 1; /* Fill the greeting package */ - proposerGreeting.tag = 'g'; - proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; - proposerGreeting.pgVersion = PG_VERSION_NUM; - pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = systemId; + greetRequest.tag = 'g'; + greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + greetRequest.pgVersion = PG_VERSION_NUM; + pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); + greetRequest.systemId = systemId; if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - proposerGreeting.timeline = ThisTimeLineID; - proposerGreeting.walSegSize = wal_segment_size; + greetRequest.timeline = ThisTimeLineID; + greetRequest.walSegSize = wal_segment_size; InitEventSet(); } @@ -416,10 +416,10 @@ static void WalProposerStart(void) { - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) + /* Initiate connections to all safekeeper nodes */ + for (int i = 0; i < n_safekeepers; i++) { - ResetConnection(&walkeeper[i]); + ResetConnection(&safekeeper[i]); } WalProposerLoop(); @@ -439,7 +439,7 @@ InitEventSet(void) if (waitEvents) elog(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, @@ -447,20 +447,20 @@ InitEventSet(void) } /* - * Updates the events we're already waiting on for the WAL keeper, setting it to + * Updates the events we're already waiting on for the safekeeper, setting it to * the provided `events` * - * This function is called any time the WAL keeper's state switches to one where + * This function is called any time the safekeeper's state switches to one where 
* it has to wait to continue. This includes the full body of AdvancePollState * and calls to IO helper functions. */ static void -UpdateEventSet(WalKeeper *wk, uint32 events) +UpdateEventSet(Safekeeper *sk, uint32 events) { /* eventPos = -1 when we don't have an event */ - Assert(wk->eventPos != -1); + Assert(sk->eventPos != -1); - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } /* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. @@ -468,7 +468,7 @@ UpdateEventSet(WalKeeper *wk, uint32 events) * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. */ static void -HackyRemoveWalProposerEvent(WalKeeper *to_remove) +HackyRemoveWalProposerEvent(Safekeeper *to_remove) { /* Remove the existing event set */ if (waitEvents) @@ -476,50 +476,50 @@ HackyRemoveWalProposerEvent(WalKeeper *to_remove) FreeWaitEventSet(waitEvents); waitEvents = NULL; } - /* Re-initialize it without adding any walkeeper events */ + /* Re-initialize it without adding any safekeeper events */ InitEventSet(); /* - * loop through the existing walkeepers. If they aren't the one we're + * loop through the existing safekeepers. If they aren't the one we're * removing, and if they have a socket we can use, re-add the applicable * events. */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - WalKeeper *wk = &walkeeper[i]; + Safekeeper *sk = &safekeeper[i]; - wk->eventPos = -1; + sk->eventPos = -1; - if (wk == to_remove) + if (sk == to_remove) continue; - /* If this WAL keeper isn't offline, add an event for it! */ - if (wk->conn != NULL) + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) { - desired_events = WalKeeperStateDesiredEvents(wk->state); - wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); } } } -/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(WalKeeper *wk) +ShutdownConnection(Safekeeper *sk) { - if (wk->conn) - walprop_finish(wk->conn); - wk->conn = NULL; - wk->state = SS_OFFLINE; - wk->flushWrite = false; - wk->currMsg = NULL; - wk->ackMsg = NULL; - - if (wk->voteResponse.termHistory.entries) - pfree(wk->voteResponse.termHistory.entries); - wk->voteResponse.termHistory.entries = NULL; - - HackyRemoveWalProposerEvent(wk); + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->currMsg = NULL; + sk->ackMsg = NULL; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); } /* @@ -529,13 +529,13 @@ ShutdownConnection(WalKeeper *wk) * On success, sets the state to SS_CONNECTING_WRITE. 
*/ static void -ResetConnection(WalKeeper *wk) +ResetConnection(Safekeeper *sk) { pgsocket sock; /* socket of the new connection */ - if (wk->state != SS_OFFLINE) + if (sk->state != SS_OFFLINE) { - ShutdownConnection(wk); + ShutdownConnection(sk); } /* @@ -544,25 +544,25 @@ ResetConnection(WalKeeper *wk) * If the connection information hasn't been filled out, we need to do * that here. */ - if (wk->conninfo[0] == '\0') + if (sk->conninfo[0] == '\0') { int written = 0; - written = snprintf((char *) &wk->conninfo, MAXCONNINFO, + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, // so it is better to be defensive and check that everything aligns well if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not create connection string for walkeeper %s:%s", wk->host, wk->port); + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } - wk->conn = walprop_connect_start((char *) &wk->conninfo); + sk->conn = walprop_connect_start((char *) &sk->conninfo); /* * "If the result is null, then libpq has been unable to allocate a new * PGconn structure" */ - if (!wk->conn) + if (!sk->conn) elog(FATAL, "failed to allocate new PGconn object"); /* @@ -570,7 +570,7 @@ ResetConnection(WalKeeper *wk) * PQconnectPoll. Before we do that though, we need to check that it * didn't immediately fail. */ - if (walprop_status(wk->conn) == WP_CONNECTION_BAD) + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) { /*--- * According to libpq docs: @@ -581,14 +581,14 @@ ResetConnection(WalKeeper *wk) * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", - wk->conninfo, walprop_error_message(wk->conn)); + sk->conninfo, walprop_error_message(sk->conn)); /* * Even though the connection failed, we still need to clean up the * object */ - walprop_finish(wk->conn); - wk->conn = NULL; + walprop_finish(sk->conn); + sk->conn = NULL; return; } @@ -605,12 +605,12 @@ ResetConnection(WalKeeper *wk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); + elog(LOG, "Connecting with node %s:%s", sk->host, sk->port); - wk->state = SS_CONNECTING_WRITE; + sk->state = SS_CONNECTING_WRITE; - sock = walprop_socket(wk->conn); - wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); + sock = walprop_socket(sk->conn); + sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); return; } @@ -635,46 +635,46 @@ TimeToReconnect(TimestampTz now) return (long) (till_reconnect / 1000); } -/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ +/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ static void -ReconnectWalKeepers(void) +ReconnectSafekeepers(void) { TimestampTz now = GetCurrentTimestamp(); if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(&walkeeper[i]); + if 
(safekeeper[i].state == SS_OFFLINE) + ResetConnection(&safekeeper[i]); } } } /* - * Performs the logic for advancing the state machine of the specified walkeeper, + * Performs the logic for advancing the state machine of the specified safekeeper, * given that a certain set of events has occured. */ static void -AdvancePollState(WalKeeper *wk, uint32 events) +AdvancePollState(Safekeeper *sk, uint32 events) { /* * Sanity check. We assume further down that the operations don't * block because the socket is ready. */ - AssertEventsOkForState(events, wk); + AssertEventsOkForState(events, sk); /* Execute the code corresponding to the current state */ - switch (wk->state) + switch (sk->state) { /* - * WAL keepers are only taken out of SS_OFFLINE by calls to + * safekeepers are only taken out of SS_OFFLINE by calls to * ResetConnection */ case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); + elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -684,21 +684,21 @@ AdvancePollState(WalKeeper *wk, uint32 events) */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: - HandleConnectionEvent(wk); + HandleConnectionEvent(sk); break; /* * Waiting for a successful CopyBoth response. */ case SS_WAIT_EXEC_RESULT: - RecvStartWALPushResult(wk); + RecvStartWALPushResult(sk); break; /* * Finish handshake comms: receive information about the safekeeper. */ case SS_HANDSHAKE_RECV: - RecvAcceptorGreeting(wk); + RecvAcceptorGreeting(sk); break; /* @@ -707,14 +707,14 @@ AdvancePollState(WalKeeper *wk, uint32 events) * transferred from SS_VOTING to sending actual vote requests. */ case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return; /* Read the safekeeper response for our candidate */ case SS_WAIT_VERDICT: - RecvVoteResponse(wk); + RecvVoteResponse(sk); break; /* Flush proposer announcement message */ @@ -725,35 +725,35 @@ AdvancePollState(WalKeeper *wk, uint32 events) * completes. If we still have more to do, we'll wait until the next * poll comes along. */ - if (!AsyncFlush(wk)) + if (!AsyncFlush(sk)) return; /* flush is done, event set and state will be updated later */ - StartStreaming(wk); + StartStreaming(sk); break; /* * Idle state for waiting votes from quorum. */ case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return; /* * Active state is used for streaming WAL and receiving feedback. 
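
/*
 * Sketch of the reconnect pacing used by ReconnectSafekeepers above:
 * offline safekeepers are only re-dialled once per reconnect timeout, and
 * the same computation doubles as the WaitEventSetWait timeout in the
 * main loop. The millisecond arithmetic below is a stand-alone
 * illustration; the real code works on TimestampTz microseconds.
 */
#include <stdint.h>

static long
time_to_reconnect(int64_t now_us, int64_t last_attempt_us, int reconnect_timeout_ms)
{
	int64_t		passed_us = now_us - last_attempt_us;
	int64_t		left_us = (int64_t) reconnect_timeout_ms * 1000 - passed_us;

	if (left_us <= 0)
		return 0;				/* due now: try reconnecting offline nodes */
	return (long) (left_us / 1000); /* milliseconds until the next attempt */
}
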
*/ case SS_ACTIVE: - HandleActiveState(wk, events); + HandleActiveState(sk, events); break; } } static void -HandleConnectionEvent(WalKeeper *wk) +HandleConnectionEvent(Safekeeper *sk) { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); /* The new set of events we'll wait on, after updating */ uint32 new_events = WL_NO_EVENTS; @@ -761,8 +761,8 @@ HandleConnectionEvent(WalKeeper *wk) switch (result) { case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); /* * We have to pick some event to update event set. @@ -777,26 +777,26 @@ HandleConnectionEvent(WalKeeper *wk) * continue doing that */ case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; + sk->state = SS_CONNECTING_READ; new_events = WL_SOCKET_READABLE; break; case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; + sk->state = SS_CONNECTING_WRITE; new_events = WL_SOCKET_WRITEABLE; break; case WP_CONN_POLLING_FAILED: elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); /* * If connecting failed, we don't want to restart * the connection because that might run us into a * loop. Instead, shut it down -- it'll naturally * restart at a slower interval on calls to - * ReconnectWalKeepers. + * ReconnectSafekeepers. */ - ShutdownConnection(wk); + ShutdownConnection(sk); return; } @@ -805,21 +805,21 @@ HandleConnectionEvent(WalKeeper *wk) * un-register the old event and re-register an event on * the new socket. */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); /* If we successfully connected, send START_WAL_PUSH query */ if (result == WP_CONN_POLLING_OK) - SendStartWALPush(wk); + SendStartWALPush(sk); } /* - * Send "START_WAL_PUSH" message as an empty query to the walkeeper. Performs + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something * goes wrong, change state to SS_OFFLINE and shutdown the connection. 
*/ static void -SendStartWALPush(WalKeeper *wk) +SendStartWALPush(Safekeeper *sk) { char *query = NULL; if (zenith_pageserver_connstring_walproposer != NULL) { @@ -827,23 +827,23 @@ SendStartWALPush(WalKeeper *wk) } else { query = psprintf("START_WAL_PUSH"); } - if (!walprop_send_query(wk->conn, query)) + if (!walprop_send_query(sk->conn, query)) { pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); return; } pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); } static void -RecvStartWALPushResult(WalKeeper *wk) +RecvStartWALPushResult(Safekeeper *sk) { - switch (walprop_get_query_result(wk->conn)) + switch (walprop_get_query_result(sk->conn)) { /* * Successful result, move on to starting the @@ -851,7 +851,7 @@ RecvStartWALPushResult(WalKeeper *wk) */ case WP_EXEC_SUCCESS_COPYBOTH: - SendProposerGreeting(wk); + SendProposerGreeting(sk); break; /* @@ -867,9 +867,9 @@ RecvStartWALPushResult(WalKeeper *wk) break; case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); return; /* @@ -878,50 +878,50 @@ RecvStartWALPushResult(WalKeeper *wk) * generic "something went wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); + elog(WARNING, "Received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); + ShutdownConnection(sk); return; } } /* * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void -SendProposerGreeting(WalKeeper *wk) +SendProposerGreeting(Safekeeper *sk) { /* * On failure, logging & resetting the connection is handled. * We just need to handle the control flow. */ - BlockingWrite(wk, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV); + BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); } static void -RecvAcceptorGreeting(WalKeeper *wk) +RecvAcceptorGreeting(Safekeeper *sk) { /* * If our reading doesn't immediately succeed, any necessary * error handling or state setting is taken care of. We can * leave any other work until later. */ - wk->greet.apm.tag = 'g'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->greet)) + sk->greetResponse.apm.tag = 'g'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; + sk->state = SS_VOTING; + sk->appendResponse.flushLsn = truncateLsn; + sk->appendResponse.hs.ts = 0; ++n_connected; if (n_connected <= quorum) { /* We're still collecting terms from the majority. 
*/ - propTerm = Max(wk->greet.term, propTerm); + propTerm = Max(sk->greetResponse.term, propTerm); /* Quorum is acquried, prepare the vote request. */ if (n_connected == quorum) @@ -934,15 +934,15 @@ RecvAcceptorGreeting(WalKeeper *wk) .tag = 'v', .term = propTerm }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); } } - else if (wk->greet.term > propTerm) + else if (sk->greetResponse.term > propTerm) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->greet.term, propTerm); + sk->host, sk->port, + sk->greetResponse.term, propTerm); } /* @@ -958,7 +958,7 @@ RecvAcceptorGreeting(WalKeeper *wk) * SS_VOTING is an idle state; read-ready indicates the * connection closed. */ - UpdateEventSet(wk, WL_SOCKET_READABLE); + UpdateEventSet(sk, WL_SOCKET_READABLE); } else { @@ -966,43 +966,43 @@ RecvAcceptorGreeting(WalKeeper *wk) * Now send voting request to the cohort and wait * responses */ - for (int j = 0; j < n_walkeepers; j++) + for (int j = 0; j < n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is * participating in voting, but hasn't sent anything * yet. */ - if (walkeeper[j].state == SS_VOTING) - SendVoteRequest(&walkeeper[j]); + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); } } } static void -SendVoteRequest(WalKeeper *wk) +SendVoteRequest(Safekeeper *sk) { /* We have quorum for voting, send our vote request */ - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); /* On failure, logging & resetting is handled */ - if (!BlockingWrite(wk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) return; /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } static void -RecvVoteResponse(WalKeeper *wk) +RecvVoteResponse(Safekeeper *sk) { - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->voteResponse)) + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; elog(LOG, "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only @@ -1010,30 +1010,30 @@ RecvVoteResponse(WalKeeper *wk) * (concurrent compute spotted) or we are not elected yet and * thus need the vote. 
*/ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); + sk->host, sk->port, + sk->voteResponse.term, propTerm); } - Assert(wk->voteResponse.term == propTerm); + Assert(sk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? */ n_votes++; if (n_votes < quorum) { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ + sk->state = SS_IDLE; /* can't do much yet, no quorum */ } else if (n_votes > quorum) { /* recovery already performed, just start streaming */ - SendProposerElected(wk); + SendProposerElected(sk); } else { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for * read-ready */ HandleElectedProposer(); @@ -1064,7 +1064,7 @@ HandleElectedProposer(void) LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); } else if (syncSafekeepers) @@ -1074,10 +1074,10 @@ HandleElectedProposer(void) exit(0); } - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); } /* @@ -1097,7 +1097,7 @@ HandleElectedProposer(void) */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - /* keep polling until all walkeepers are synced */ + /* keep polling until all safekeepers are synced */ return; } @@ -1114,9 +1114,9 @@ GetHighestTerm(TermHistory *th) /* safekeeper's epoch is the term of the highest entry in the log */ static term_t -GetEpoch(WalKeeper *wk) +GetEpoch(Safekeeper *sk) { - return GetHighestTerm(&wk->voteResponse.termHistory); + return GetHighestTerm(&sk->voteResponse.termHistory); } /* @@ -1136,19 +1136,19 @@ DetermineEpochStartLsn(void) donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_IDLE) + if (safekeeper[i].state == SS_IDLE) { - if (GetEpoch(&walkeeper[i]) > donorEpoch || - (GetEpoch(&walkeeper[i]) == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { - donorEpoch = GetEpoch(&walkeeper[i]); - propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; donor = i; } - truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); } } @@ -1177,7 +1177,7 @@ DetermineEpochStartLsn(void) /* * Proposer's term history is the donor's + its own entry. 
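
/*
 * Sketch of the donor selection performed in DetermineEpochStartLsn
 * above: among the safekeepers that voted, pick the one with the highest
 * last-log term (its "epoch"), breaking ties by flush LSN; that node's
 * flush LSN becomes the start of the new epoch. The structures here are
 * trimmed stand-ins for VoteResponse.
 */
#include <stdint.h>

typedef uint64_t term_t;
typedef uint64_t XLogRecPtr;

typedef struct
{
	term_t		epoch;			/* term of the highest entry in its log */
	XLogRecPtr	flushLsn;
} MiniVote;

static int
choose_donor(const MiniVote *votes, int n, XLogRecPtr *epochStartLsn)
{
	int			donor = 0;

	for (int i = 1; i < n; i++)
	{
		if (votes[i].epoch > votes[donor].epoch ||
			(votes[i].epoch == votes[donor].epoch &&
			 votes[i].flushLsn > votes[donor].flushLsn))
			donor = i;
	}
	*epochStartLsn = votes[donor].flushLsn;
	return donor;
}
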
*/ - dth = &walkeeper[donor].voteResponse.termHistory; + dth = &safekeeper[donor].voteResponse.termHistory; propTermHistory.n_entries = dth->n_entries + 1; propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); @@ -1188,13 +1188,13 @@ DetermineEpochStartLsn(void) quorum, propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), - walkeeper[donor].host, walkeeper[donor].port, + safekeeper[donor].host, safekeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) ); } /* - * Receive WAL from most advanced WAL keeper + * Receive WAL from most advanced safekeeper */ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) @@ -1205,20 +1205,20 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalRcvStreamOptions options; sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { ereport(WARNING, (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[donor].host, walkeeper[donor].port, + safekeeper[donor].host, safekeeper[donor].port, err))); return false; } elog(LOG, "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " "%d", - walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; @@ -1274,16 +1274,16 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } /* - * Determine for wk the starting streaming point and send it message + * Determine for sk the starting streaming point and send it message * 1) Announcing we are elected proposer (which immediately advances epoch if * safekeeper is synced, being important for sync-safekeepers) * 2) Communicating starting streaming point -- safekeeper must truncate its WAL * beyond it -- and history of term switching. * - * Sets wk->startStreamingAt. + * Sets sk->startStreamingAt. */ static void -SendProposerElected(WalKeeper *wk) +SendProposerElected(Safekeeper *sk) { ProposerElected msg; TermHistory *th; @@ -1298,13 +1298,13 @@ SendProposerElected(WalKeeper *wk) * there is some WAL on safekeeper, if immediately after bootstrap compute * wrote some WAL on single sk and died; we stream since the beginning then. */ - th = &wk->voteResponse.termHistory; + th = &sk->voteResponse.termHistory; /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. */ Assert((th->n_entries == 0) == - (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); /* We must start somewhere. 
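
/*
 * Sketch of the history comparison behind SendProposerElected above: walk
 * the proposer's and the safekeeper's term histories from the start, find
 * the last entry on which both agree, and stream from the end of that
 * common term, capped by what the safekeeper has actually flushed. This
 * is a simplified comparison over trimmed stand-ins, not the real
 * TermHistory type or the exact loop from the patch.
 */
#include <stdint.h>

typedef uint64_t term_t;
typedef uint64_t XLogRecPtr;

typedef struct
{
	term_t		term;
	XLogRecPtr	lsn;			/* LSN at which this term starts */
} MiniTermSwitch;

/* Returns the index of the last common entry, or -1 if there is none. */
static int
last_common_entry(const MiniTermSwitch *prop, int n_prop,
				  const MiniTermSwitch *sk, int n_sk)
{
	int			i = -1;

	while (i + 1 < n_prop && i + 1 < n_sk &&
		   prop[i + 1].term == sk[i + 1].term &&
		   prop[i + 1].lsn == sk[i + 1].lsn)
		i++;
	return i;
}
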
*/ Assert(propTermHistory.n_entries >= 1); @@ -1319,7 +1319,7 @@ SendProposerElected(WalKeeper *wk) if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ - wk->startStreamingAt = propTermHistory.entries[0].lsn; + sk->startStreamingAt = propTermHistory.entries[0].lsn; } else { @@ -1331,44 +1331,44 @@ SendProposerElected(WalKeeper *wk) */ if (propTermHistory.entries[i].term == propTerm) { - wk->startStreamingAt = wk->voteResponse.flushLsn; + sk->startStreamingAt = sk->voteResponse.flushLsn; } else { XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - wk->voteResponse.flushLsn); - wk->startStreamingAt = Min(propEndLsn, skEndLsn); + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + Assert(msgQueueHead == NULL || sk->startStreamingAt >= msgQueueHead->req.beginLsn); msg.tag = 'e'; msg.term = propTerm; - msg.startStreamingAt = wk->startStreamingAt; + msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &propTermHistory; lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; elog(LOG, "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); - resetStringInfo(&wk->outbuf); - pq_sendint64_le(&wk->outbuf, msg.tag); - pq_sendint64_le(&wk->outbuf, msg.term); - pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); for (int i = 0; i < msg.termHistory->n_entries; i++) { - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); } - if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; - StartStreaming(wk); + StartStreaming(sk); } /* @@ -1382,31 +1382,31 @@ WalProposerStartStreaming(XLogRecPtr startpos) elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = proposerGreeting.timeline; + cmd.timeline = greetRequest.timeline; cmd.startpoint = startpos; StartReplication(&cmd); } /* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE and sets + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets * correct event set. */ static void -StartStreaming(WalKeeper *wk) +StartStreaming(Safekeeper *sk) { - int wki = wk - walkeeper; + int wki = sk - safekeeper; /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. 
*/ - wk->state = SS_ACTIVE; + sk->state = SS_ACTIVE; for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { - if (msg->req.endLsn <= wk->startStreamingAt) + if (msg->req.endLsn <= sk->startStreamingAt) { - /* message is already received by this walkeeper */ + /* message is already received by this safekeeper */ msg->ackMask |= 1 << wki; } else @@ -1418,7 +1418,7 @@ StartStreaming(WalKeeper *wk) } /* Call SS_ACTIVE handler to update event set */ - HandleActiveState(wk, WL_NO_EVENTS); + HandleActiveState(sk, WL_NO_EVENTS); } /* @@ -1430,11 +1430,11 @@ StartStreaming(WalKeeper *wk) static void SendMessageToNode(int i, WalMessage *msg) { - WalKeeper *wk = &walkeeper[i]; + Safekeeper *sk = &safekeeper[i]; /* we shouldn't be already sending something */ - Assert(wk->currMsg == NULL); - Assert(wk->state == SS_ACTIVE); + Assert(sk->currMsg == NULL); + Assert(sk->state == SS_ACTIVE); /* * Skip already acknowledged messages. Used after reconnection to get to @@ -1443,21 +1443,21 @@ SendMessageToNode(int i, WalMessage *msg) while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; - wk->currMsg = msg; + sk->currMsg = msg; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - HandleActiveState(wk, WL_SOCKET_WRITEABLE); + HandleActiveState(sk, WL_SOCKET_WRITEABLE); } /* - * Broadcast new message to all caught-up walkeepers + * Broadcast new message to all caught-up safekeepers */ static void BroadcastMessage(WalMessage *msg) { - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) + if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) { SendMessageToNode(i, msg); } @@ -1494,7 +1494,7 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; - msg->req.proposerId = proposerGreeting.proposerId; + msg->req.proposerId = greetRequest.proposerId; memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); @@ -1503,7 +1503,7 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) } /* - * Create WAL message with no data, just to let the walkeepers + * Create WAL message with no data, just to let the safekeepers * know that commit lsn has advanced. */ static WalMessage * @@ -1537,7 +1537,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) */ msg->req.beginLsn = lsn; msg->req.endLsn = lsn; - msg->req.proposerId = proposerGreeting.proposerId; + msg->req.proposerId = greetRequest.proposerId; /* * truncateLsn and commitLsn are set just before the message sent, in @@ -1550,64 +1550,64 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) * Process all events happened in SS_ACTIVE state, update event set after that. */ static void -HandleActiveState(WalKeeper *wk, uint32 events) +HandleActiveState(Safekeeper *sk, uint32 events) { uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) + if (!SendAppendRequests(sk)) return; if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) + if (!RecvAppendResponses(sk)) return; /* * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * wk->currMsg checks if we have pending unsent messages. This check isn't + * sk->currMsg checks if we have pending unsent messages. This check isn't * necessary now, because we always send queue messages immediately after * creation. 
But it's good to have it here in case we change this behavior * in the future. */ - if (wk->currMsg != NULL || wk->flushWrite) + if (sk->currMsg != NULL || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; - UpdateEventSet(wk, newEvents); + UpdateEventSet(sk, newEvents); } /* - * Send queue messages starting from wk->currMsg until the end or non-writable + * Send queue messages starting from sk->currMsg until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. */ static bool -SendAppendRequests(WalKeeper *wk) +SendAppendRequests(Safekeeper *sk) { - int wki = wk - walkeeper; + int wki = sk - safekeeper; WalMessage *msg; AppendRequestHeader *req; PGAsyncWriteResult writeResult; - if (wk->flushWrite) + if (sk->flushWrite) { - if (!AsyncFlush(wk)) + if (!AsyncFlush(sk)) /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. */ - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - wk->flushWrite = false; + sk->flushWrite = false; } - while (wk->currMsg) + while (sk->currMsg) { - msg = wk->currMsg; + msg = sk->currMsg; req = &msg->req; req->commitLsn = GetAcknowledgedByQuorumWALPosition(); @@ -1620,20 +1620,20 @@ SendAppendRequests(WalKeeper *wk) * form the cut version. Only happens for the first * message. */ - if (wk->startStreamingAt > msg->req.beginLsn) + if (sk->startStreamingAt > msg->req.beginLsn) { uint32 len; uint32 size; - Assert(wk->startStreamingAt < req->endLsn); + Assert(sk->startStreamingAt < req->endLsn); - len = msg->req.endLsn - wk->startStreamingAt; + len = msg->req.endLsn - sk->startStreamingAt; size = sizeof(AppendRequestHeader) + len; req = malloc(size); *req = msg->req; - req->beginLsn = wk->startStreamingAt; + req->beginLsn = sk->startStreamingAt; memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - + (char *) (&msg->req + 1) + sk->startStreamingAt - msg->req.beginLsn, len); } @@ -1644,25 +1644,25 @@ SendAppendRequests(WalKeeper *wk) LSN_FORMAT_ARGS(req->beginLsn), LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); /* if this is the first sent message, we should start processing feedback */ - if (wk->ackMsg == NULL) - wk->ackMsg = wk->currMsg; + if (sk->ackMsg == NULL) + sk->ackMsg = sk->currMsg; /* * We write with msg->size here because the body of the * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - writeResult = walprop_async_write(wk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); + writeResult = walprop_async_write(sk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); /* Free up resources */ if (req != &msg->req) free(req); /* Mark current message as sent, whatever the result is */ - wk->currMsg = wk->currMsg->next; + sk->currMsg = sk->currMsg->next; switch (writeResult) { @@ -1675,14 +1675,14 @@ SendAppendRequests(WalKeeper *wk) * We still need to call PQflush some more to finish the job. * Caller function will handle this by setting right event set. 
*/ - wk->flushWrite = true; + sk->flushWrite = true; return true; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; default: Assert(false); @@ -1702,10 +1702,10 @@ SendAppendRequests(WalKeeper *wk) * NB: This function can call SendMessageToNode and produce new messages. */ static bool -RecvAppendResponses(WalKeeper *wk) +RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - int wki = wk - walkeeper; + int wki = sk - safekeeper; bool readAnything = false; while (true) @@ -1715,36 +1715,36 @@ RecvAppendResponses(WalKeeper *wk) * necessary error handling or state setting is taken care * of. We can leave any other work until later. */ - wk->feedback.apm.tag = 'a'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->feedback)) + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); + Assert(sk->ackMsg != NULL && (sk->ackMsg->ackMask & (1 << wki)) == 0); /* - * We shouldn't read responses ahead of wk->currMsg, because that will + * We shouldn't read responses ahead of sk->currMsg, because that will * look like we are receiving responses for messages that haven't been * sent yet. */ - Assert(wk->ackMsg != wk->currMsg); + Assert(sk->ackMsg != sk->currMsg); - wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + sk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms * receiving of this * message */ - wk->ackMsg = wk->ackMsg->next; + sk->ackMsg = sk->ackMsg->next; readAnything = true; } if (!readAnything) - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; - HandleWalKeeperResponse(); + HandleSafekeeperResponse(); /* - * Also send the new commit lsn to all the walkeepers. + * Also send the new commit lsn to all the safekeepers. * - * FIXME: This is redundant for walkeepers that have other + * FIXME: This is redundant for safekeepers that have other * outbound messages pending. */ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); @@ -1754,11 +1754,11 @@ RecvAppendResponses(WalKeeper *wk) lastSentCommitLsn = minQuorumLsn; } - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; } /* - * Combine hot standby feedbacks from all walkeepers. + * Combine hot standby feedbacks from all safekeepers. 
*/ static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) @@ -1767,19 +1767,19 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) hs->xmin.value = ~0; /* largest unsigned value */ hs->catalog_xmin.value = ~0; /* largest unsigned value */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.hs.ts != 0) + if (safekeeper[i].appendResponse.hs.ts != 0) { - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) { - hs->xmin = walkeeper[i].feedback.hs.xmin; - hs->ts = walkeeper[i].feedback.hs.ts; + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; } - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) { - hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; - hs->ts = walkeeper[i].feedback.hs.ts; + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; } } } @@ -1792,11 +1792,11 @@ static XLogRecPtr CalculateDiskConsistentLsn(void) { XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.diskConsistentLsn < lsn) + if (safekeeper[i].appendResponse.diskConsistentLsn < lsn) { - lsn = walkeeper[i].feedback.diskConsistentLsn; + lsn = safekeeper[i].appendResponse.diskConsistentLsn; } } return lsn; @@ -1810,10 +1810,10 @@ static XLogRecPtr CalculateMinFlushLsn(void) { XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.flushLsn < lsn) - lsn = walkeeper[i].feedback.flushLsn; + if (safekeeper[i].appendResponse.flushLsn < lsn) + lsn = safekeeper[i].appendResponse.flushLsn; } return lsn; } @@ -1824,30 +1824,30 @@ CalculateMinFlushLsn(void) static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void) { - XLogRecPtr responses[MAX_WALKEEPERS]; + XLogRecPtr responses[MAX_SAFEKEEPERS]; /* * Sort acknowledged LSNs */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { /* * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? - walkeeper[i].feedback.flushLsn : 0; + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? 
+ safekeeper[i].appendResponse.flushLsn : 0; } - qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); /* * Get the smallest LSN committed by quorum */ - return responses[n_walkeepers - quorum]; + return responses[n_safekeepers - quorum]; } static void -HandleWalKeeperResponse(void) +HandleSafekeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; @@ -1857,30 +1857,30 @@ HandleWalKeeperResponse(void) minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = CalculateDiskConsistentLsn(); - if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.diskConsistentLsn) { - if (minQuorumLsn > lastFeedback.flushLsn) - lastFeedback.flushLsn = minQuorumLsn; + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; - lastFeedback.diskConsistentLsn = diskConsistentLsn; + quorumFeedback.diskConsistentLsn = diskConsistentLsn; /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( // write_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, + quorumFeedback.flushLsn, //flush_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, + quorumFeedback.flushLsn, //apply_lsn - This is what processed and durably saved at pageserver. - lastFeedback.diskConsistentLsn, + quorumFeedback.diskConsistentLsn, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { - lastFeedback.hs = hsFeedback; + quorumFeedback.hs = hsFeedback; if (!syncSafekeepers) ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), @@ -1909,7 +1909,7 @@ HandleWalKeeperResponse(void) truncateLsn = minFlushLsn; /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_safekeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) { WalMessage *msg = msgQueueHead; msgQueueHead = msg->next; @@ -1936,28 +1936,28 @@ HandleWalKeeperResponse(void) * moment we don't have any good mechanism of defining the healthy and * most advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-walkeeper) we currently - * wait for all seemingly alive walkeepers to get synced. + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. */ if (syncSafekeepers) { int n_synced; n_synced = 0; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ - if (wk->state != SS_OFFLINE && !synced) + if (sk->state != SS_OFFLINE && !synced) return; if (synced) n_synced++; } if (n_synced >= quorum) { - /* All walkeepers synced! 
*/ + /* All safekeepers synced! */ fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); exit(0); } @@ -1969,9 +1969,9 @@ HandleWalKeeperResponse(void) * failure. */ static bool -AsyncRead(WalKeeper *wk, char **buf, int *buf_size) +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) { - switch (walprop_async_read(wk->conn, buf, buf_size)) + switch (walprop_async_read(sk->conn, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: return true; @@ -1981,10 +1981,10 @@ AsyncRead(WalKeeper *wk, char **buf, int *buf_size) return false; case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", wk->host, - wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; } Assert(false); @@ -2001,14 +2001,14 @@ AsyncRead(WalKeeper *wk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) { char *buf; int buf_size; uint64 tag; StringInfoData s; - if (!(AsyncRead(wk, &buf, &buf_size))) + if (!(AsyncRead(sk, &buf, &buf_size))) return false; /* parse it */ @@ -2019,9 +2019,9 @@ AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return false; } @@ -2083,43 +2083,43 @@ AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) * single packet. */ static bool -BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { uint32 events; - if (!walprop_blocking_write(wk->conn, msg, msg_size)) + if (!walprop_blocking_write(sk->conn, msg, msg_size)) { elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; } - wk->state = success_state; + sk->state = success_state; /* * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = WalKeeperStateDesiredEvents(success_state); + events = SafekeeperStateDesiredEvents(success_state); if (events) - UpdateEventSet(wk, events); + UpdateEventSet(sk, events); return true; } /* - * Starts a write into the 'i'th WAL keeper's postgres connection, moving to + * Starts a write into the 'i'th safekeeper's postgres connection, moving to * flush_state (adjusting eventset) if write still needs flushing. * * Returns false if sending is unfinished (requires flushing or conn failed). * Upon failure, a warning is emitted and the connection is reset. 
*/ static bool -AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state) +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) { - switch (walprop_async_write(wk->conn, msg, msg_size)) + switch (walprop_async_write(sk->conn, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: return true; @@ -2130,14 +2130,14 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * to the appropriate state. Update the event set at the bottom of * this function */ - wk->state = flush_state; - UpdateEventSet(wk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; default: Assert(false); @@ -2154,7 +2154,7 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * WL_SOCKET_WRITEABLE. */ static bool -AsyncFlush(WalKeeper *wk) +AsyncFlush(Safekeeper *sk) { /*--- * PQflush returns: @@ -2162,7 +2162,7 @@ AsyncFlush(WalKeeper *wk) * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ - switch (walprop_flush(wk->conn)) + switch (walprop_flush(sk->conn)) { case 0: /* flush is done */ @@ -2172,9 +2172,9 @@ AsyncFlush(WalKeeper *wk) return false; case -1: elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); return false; default: Assert(false); diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 74ea1cfd5b1..37f8d2075f6 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -22,20 +22,20 @@ CompareLsn(const void *a, const void *b) return 1; } -/* Returns a human-readable string corresonding to the WalKeeperState +/* Returns a human-readable string corresonding to the SafekeeperState * * The string should not be freed. 
* * The strings are intended to be used as a prefix to "state", e.g.: * - * elog(LOG, "currently in %s state", FormatWalKeeperState(wk->state)); + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * elog(LOG, "currently in state [%s]", FormatWalKeeperState(wk->state)); + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); */ char* -FormatWalKeeperState(WalKeeperState state) +FormatSafekeeperState(SafekeeperState state) { char* return_val = NULL; @@ -76,11 +76,11 @@ FormatWalKeeperState(WalKeeperState state) return return_val; } -/* Asserts that the provided events are expected for given WAL keeper's state */ +/* Asserts that the provided events are expected for given safekeeper's state */ void -AssertEventsOkForState(uint32 events, WalKeeper* wk) +AssertEventsOkForState(uint32 events, Safekeeper* sk) { - uint32 expected = WalKeeperStateDesiredEvents(wk->state); + uint32 expected = SafekeeperStateDesiredEvents(sk->state); /* The events are in-line with what we're expecting, under two conditions: * (a) if we aren't expecting anything, `events` has no read- or @@ -99,17 +99,17 @@ AssertEventsOkForState(uint32 events, WalKeeper* wk) { /* To give a descriptive message in the case of failure, we use elog and * then an assertion that's guaranteed to fail. */ - elog(WARNING, "events %s mismatched for walkeeper %s:%s in state [%s]", - FormatEvents(events), wk->host, wk->port, FormatWalKeeperState(wk->state)); + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); Assert(events_ok_for_state); } } -/* Returns the set of events a WAL keeper in this state should be waiting on +/* Returns the set of events a safekeeper in this state should be waiting on * * This will return WL_NO_EVENTS (= 0) for some events. */ uint32 -WalKeeperStateDesiredEvents(WalKeeperState state) +SafekeeperStateDesiredEvents(SafekeeperState state) { uint32 result; @@ -143,7 +143,7 @@ WalKeeperStateDesiredEvents(WalKeeperState state) * Active state does both reading and writing. * * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check wk->flushWrite here to set WL_SOCKET_WRITEABLE. + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. */ case SS_SEND_ELECTED_FLUSH: case SS_ACTIVE: diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9506a6ee887..51308cbe5a4 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -1,5 +1,5 @@ -#ifndef __WALKEEPER_H__ -#define __WALKEEPER_H__ +#ifndef __WALPROPOSER_H__ +#define __WALPROPOSER_H__ #include "access/xlogdefs.h" #include "postgres.h" @@ -13,7 +13,7 @@ #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 1 -#define MAX_WALKEEPERS 32 +#define MAX_SAFEKEEPERS 32 #define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ @@ -140,7 +140,7 @@ typedef enum * to read. */ SS_ACTIVE, -} WalKeeperState; +} SafekeeperState; /* Consensus logical timestamp. 
*/ typedef uint64 term_t; @@ -153,7 +153,7 @@ typedef uint64 term_t; typedef struct ProposerGreeting { uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ @@ -210,7 +210,7 @@ typedef struct VoteResponse { * proposer to choose the most advanced one. */ XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ TermHistory termHistory; } VoteResponse; @@ -229,7 +229,7 @@ typedef struct ProposerElected } ProposerElected; /* - * Header of request with WAL message sent from proposer to walkeeper. + * Header of request with WAL message sent from proposer to safekeeper. */ typedef struct AppendRequestHeader { @@ -242,7 +242,7 @@ typedef struct AppendRequestHeader XLogRecPtr epochStartLsn; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end lsn * + 1 of last chunk streamed to everyone) @@ -260,7 +260,7 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - AppendRequestHeader req; /* request to walkeeper (message header) */ + AppendRequestHeader req; /* request to safekeeper (message header) */ /* PHANTOM FIELD: * @@ -280,7 +280,7 @@ typedef struct HotStandbyFeedback } HotStandbyFeedback; /* - * Report walkeeper state to proposer + * Report safekeeper state to proposer */ typedef struct AppendResponse { @@ -302,9 +302,9 @@ typedef struct AppendResponse /* - * Descriptor of walkeeper + * Descriptor of safekeeper */ -typedef struct WalKeeper +typedef struct Safekeeper { char const* host; char const* port; @@ -324,21 +324,21 @@ typedef struct WalKeeper WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ - WalKeeperState state; /* walkeeper state machine state */ - AcceptorGreeting greet; /* acceptor greeting */ + SafekeeperState state; /* safekeeper state machine state */ + AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ - AppendResponse feedback; /* feedback to master */ + AppendResponse appendResponse; /* feedback to master */ /* * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; -} WalKeeper; +} Safekeeper; int CompareLsn(const void *a, const void *b); -char* FormatWalKeeperState(WalKeeperState state); -void AssertEventsOkForState(uint32 events, WalKeeper* wk); -uint32 WalKeeperStateDesiredEvents(WalKeeperState state); +char* FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper* sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); @@ -385,7 +385,7 @@ typedef enum WP_EXEC_SUCCESS_COPYBOTH, /* Any success result other than a single CopyBoth was received. 
The specifics of the result * were already logged, but it may be useful to provide an error message indicating which - * walkeeper messed up. + * safekeeper messed up. * * Do not expect PQerrorMessage to be appropriately set. */ WP_EXEC_UNEXPECTED_SUCCESS, @@ -441,11 +441,11 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); /* * Ergonomic wrapper around PGgetCopyData * - * Reads a CopyData block from a walkeeper, setting *amount to the number + * Reads a CopyData block from a safekeeper, setting *amount to the number * of bytes returned. * * This function is allowed to assume certain properties specific to the - * protocol with the walkeepers, so it should not be used as-is for any + * protocol with the safekeepers, so it should not be used as-is for any * other purpose. * * Note: If possible, using is generally preferred, because it @@ -459,7 +459,7 @@ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, /* * Ergonomic wrapper around PQputCopyData + PQflush * - * Starts to write a CopyData block to a walkeeper. + * Starts to write a CopyData block to a safekeeper. * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index fe848491300..5626e0802ca 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2850,7 +2850,7 @@ WaitEventTimeout WaitPMResult WalCloseMethod WalLevel -WalKeeper +Safekeeper WalMessage WalRcvData WalRcvExecResult From 234e6e2930234df2782809c619b07495d7957126 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 09:54:56 +0300 Subject: [PATCH 089/214] Report back-pressure trottling status of backend --- src/backend/replication/walsender.c | 2 +- src/backend/tcop/postgres.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 2f834301cd0..d9dfea8f8d0 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3815,7 +3815,7 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply uint64 backpressure_lag(void) { - if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0) { XLogRecPtr writePtr; XLogRecPtr flushPtr; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 1c3974aedd1..3f903cb6fc3 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3406,6 +3406,8 @@ ProcessInterrupts(void) if (lag <= 0) break; + set_ps_display("backpressure throttling"); + elog(DEBUG2, "backpressure throttling: lag %lu", lag); pg_usleep(BACK_PRESSURE_DELAY); } From a19cfb34c87b4d2603dec1df31061b49a273f600 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 16:27:08 +0300 Subject: [PATCH 090/214] Add max_replication_write_lag --- src/backend/access/transam/xloginsert.c | 1 + src/backend/replication/walsender.c | 9 ++++++++- src/backend/utils/misc/guc.c | 12 ++++++++++++ src/include/access/xloginsert.h | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 306841fb8d2..493f3cb45d4 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -66,6 +66,7 @@ typedef struct /* GUCs */ int max_replication_apply_lag; int max_replication_flush_lag; +int 
max_replication_write_lag; static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index d9dfea8f8d0..3da1a97c30d 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3815,7 +3815,7 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply uint64 backpressure_lag(void) { - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0) + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { XLogRecPtr writePtr; XLogRecPtr flushPtr; @@ -3831,6 +3831,13 @@ backpressure_lag(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); + if ((writePtr != UnknownXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + if ((flushPtr != UnknownXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fb2c5fa67ea..db0bf7ed967 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2936,6 +2936,18 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 45dcaf99d9e..391c1a2716a 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -40,6 +40,7 @@ extern int max_replication_apply_lag; extern int max_replication_flush_lag; +extern int max_replication_write_lag; /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); From af5761f37b6481522b73869b7499d08f24256bff Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 10 Jan 2022 16:39:05 +0300 Subject: [PATCH 091/214] Do not throttle wal sender --- src/backend/tcop/postgres.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 3f903cb6fc3..7ea135c7c9e 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3389,8 +3389,8 @@ ProcessInterrupts(void) if (InterruptHoldoffCount != 0 || CritSectionCount != 0) return; - // Don't throttle read only transactions - if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + // Don't throttle read only transactions and wal sender + if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) { ProcessInterrupts_pg(); return; From 4c259dfe135df9045f07373ec4b4fb60853d31f8 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 13 Jan 2022 20:26:07 +0300 Subject: [PATCH 092/214] Don't track acks in walproposer (#119) --- src/backend/replication/walproposer.c | 83 ++++++--------------------- src/include/replication/walproposer.h | 2 - 2 files 
changed, 19 insertions(+), 66 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 6f89c23eb2f..c1008440a76 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -133,7 +133,7 @@ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr start static void SendProposerElected(Safekeeper *sk); static void WalProposerStartStreaming(XLogRecPtr startpos); static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(int i, WalMessage *msg); +static void SendMessageToNode(Safekeeper *sk, WalMessage *msg); static void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); @@ -379,7 +379,6 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) initStringInfo(&safekeeper[n_safekeepers].outbuf); safekeeper[n_safekeepers].flushWrite = false; safekeeper[n_safekeepers].currMsg = NULL; - safekeeper[n_safekeepers].ackMsg = NULL; safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; n_safekeepers += 1; } @@ -513,7 +512,6 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->flushWrite = false; sk->currMsg = NULL; - sk->ackMsg = NULL; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1394,7 +1392,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - int wki = sk - safekeeper; + WalMessage *startMsg = msgQueueHead; /* * This is the only entrypoint to state SS_ACTIVE. It's executed @@ -1402,23 +1400,14 @@ StartStreaming(Safekeeper *sk) */ sk->state = SS_ACTIVE; - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= sk->startStreamingAt) - { - /* message is already received by this safekeeper */ - msg->ackMask |= 1 << wki; - } - else - { - /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(wki, msg); - return; - } - } + while (startMsg != NULL && startMsg->req.endLsn <= sk->startStreamingAt) + startMsg = startMsg->next; + + /* We should always have WAL to start from sk->startStreamingAt */ + Assert(startMsg == NULL || startMsg->req.beginLsn <= sk->startStreamingAt); - /* Call SS_ACTIVE handler to update event set */ - HandleActiveState(sk, WL_NO_EVENTS); + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk, startMsg); } /* @@ -1428,21 +1417,12 @@ StartStreaming(Safekeeper *sk) * in case of errors. */ static void -SendMessageToNode(int i, WalMessage *msg) +SendMessageToNode(Safekeeper *sk, WalMessage *msg) { - Safekeeper *sk = &safekeeper[i]; - /* we shouldn't be already sending something */ Assert(sk->currMsg == NULL); Assert(sk->state == SS_ACTIVE); - /* - * Skip already acknowledged messages. Used after reconnection to get to - * the first not yet sent message. Otherwise we always just send 'msg'. 
- */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; - sk->currMsg = msg; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ @@ -1459,7 +1439,7 @@ BroadcastMessage(WalMessage *msg) { if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) { - SendMessageToNode(i, msg); + SendMessageToNode(&safekeeper[i], msg); } } } @@ -1488,7 +1468,6 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; - msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -1521,7 +1500,6 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; - msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -1541,7 +1519,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) /* * truncateLsn and commitLsn are set just before the message sent, in - * SendMessageToNode() + * SendAppendRequests() */ return msg; } @@ -1587,7 +1565,6 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - int wki = sk - safekeeper; WalMessage *msg; AppendRequestHeader *req; PGAsyncWriteResult writeResult; @@ -1613,8 +1590,6 @@ SendAppendRequests(Safekeeper *sk) req->commitLsn = GetAcknowledgedByQuorumWALPosition(); req->truncateLsn = truncateLsn; - Assert((msg->ackMask & (1 << wki)) == 0); - /* * If we need to send this message not from the beginning, * form the cut version. Only happens for the first @@ -1646,10 +1621,6 @@ SendAppendRequests(Safekeeper *sk) LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); - /* if this is the first sent message, we should start processing feedback */ - if (sk->ackMsg == NULL) - sk->ackMsg = sk->currMsg; - /* * We write with msg->size here because the body of the * message is stored after the end of the WalMessage @@ -1705,7 +1676,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - int wki = sk - safekeeper; bool readAnything = false; while (true) @@ -1719,20 +1689,6 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - Assert(sk->ackMsg != NULL && (sk->ackMsg->ackMask & (1 << wki)) == 0); - - /* - * We shouldn't read responses ahead of sk->currMsg, because that will - * look like we are receiving responses for messages that haven't been - * sent yet. - */ - Assert(sk->ackMsg != sk->currMsg); - - sk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms - * receiving of this - * message */ - - sk->ackMsg = sk->ackMsg->next; readAnything = true; } @@ -1908,8 +1864,11 @@ HandleSafekeeperResponse(void) if (minFlushLsn > truncateLsn) truncateLsn = minFlushLsn; - /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_safekeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + /* + * Cleanup message queue up to truncateLsn. These messages were processed + * by all safekeepers because they all reported flushLsn greater than endLsn. 
+ */ + while (msgQueueHead != NULL && msgQueueHead->req.endLsn < truncateLsn) { WalMessage *msg = msgQueueHead; msgQueueHead = msg->next; @@ -1919,13 +1878,9 @@ HandleSafekeeperResponse(void) } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ - if (msgQueueHead) - { - /* Max takes care of special 0-sized messages */ - Assert(truncateLsn >= msgQueueHead->req.beginLsn && - truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); - } + Assert(msgQueueHead == NULL || (truncateLsn >= msgQueueHead->req.beginLsn && truncateLsn <= msgQueueHead->req.endLsn)); /* * Generally sync is done when majority switched the epoch so we committed diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 51308cbe5a4..53f1a6de2fe 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -259,7 +259,6 @@ struct WalMessage { WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ - uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ AppendRequestHeader req; /* request to safekeeper (message header) */ /* PHANTOM FIELD: @@ -321,7 +320,6 @@ typedef struct Safekeeper bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ - WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ SafekeeperState state; /* safekeeper state machine state */ From 36e2a3baefe314dee4e6b97754a6c1b72fcc60b5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Jan 2022 20:33:19 +0200 Subject: [PATCH 093/214] Silence excessively noisy logging from walproposer. In the passing, switch a few places to ereport() instead of elog(), to avoid the overhead of constructing the string when it's not logged. 
Fixes https://github.com/zenithdb/zenith/issues/1066 --- src/backend/replication/walproposer.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index c1008440a76..2b25eb9b61f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1250,14 +1250,16 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec rec_start_lsn = pg_ntoh64(rec_start_lsn); rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; (void) CreateMessage(rec_start_lsn, buf, len); - elog(DEBUG1, "Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len); + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); if (rec_end_lsn >= endpos) break; } } - elog(DEBUG1, "end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)); + ereport(DEBUG1, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); walrcv_disconnect(wrconn); } else @@ -1613,13 +1615,13 @@ SendAppendRequests(Safekeeper *sk) len); } - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); /* * We write with msg->size here because the body of the From dbc6341d998227dab0413c3dc7b78f4f1d9cfc27 Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 20 Dec 2021 22:20:00 +0300 Subject: [PATCH 094/214] Extend replication protocol with ZenithFeedback message. Add extensible ZenithFeedback part to AppendResponse messages Pass values sizes together with keys in ZenithFeedback message. Add standby_status_update fields into ZenithFeedback. Get rid of diskConsistentLsn field in AppendResponse, because now it is send via ZenithFeedback. 
Fix calculation of diskConsistentLsn and instanceSize - take values from latest reply from pageserver --- src/backend/replication/walproposer.c | 132 +++++++++++++++++++++----- src/backend/replication/walsender.c | 30 ++++++ src/include/replication/walproposer.h | 24 ++++- 3 files changed, 160 insertions(+), 26 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 2b25eb9b61f..a3d12f2eab0 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -141,7 +141,6 @@ static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); -static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleSafekeeperResponse(void); @@ -672,7 +671,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_OFFLINE: elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -1715,6 +1714,72 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } +void +ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->currentInstanceSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: current_timeline_size %lu", + zf->currentInstanceSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(zf->ps_replytime)); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_replytime %lu reply_time: %s", + zf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseZenithFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + /* * Combine hot standby feedbacks from all safekeepers. 
*/ @@ -1743,22 +1808,6 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } -/* - * Get minimum of disk consistent LSNs of all safekeepers - */ -static XLogRecPtr -CalculateDiskConsistentLsn(void) -{ - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.diskConsistentLsn < lsn) - { - lsn = safekeeper[i].appendResponse.diskConsistentLsn; - } - } - return lsn; -} /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the @@ -1804,6 +1853,31 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } +/* + * Get ZenithFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ZenithFeedback *zf) +{ + int latest_safekeeper = 0; + uint64 replyTime = 0; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.zf.ps_replytime > replyTime) + { + latest_safekeeper = i; + replyTime = safekeeper[i].appendResponse.zf.ps_replytime; + elog(LOG, "safekeeper[%d] replyTime %lu", i, replyTime); + } + } + + zf->currentInstanceSize = safekeeper[latest_safekeeper].appendResponse.zf.currentInstanceSize; + zf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_writelsn; + zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; + zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; + zf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.zf.ps_replytime; +} + static void HandleSafekeeperResponse(void) { @@ -1812,17 +1886,18 @@ HandleSafekeeperResponse(void) XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = CalculateDiskConsistentLsn(); + diskConsistentLsn = quorumFeedback.zf.ps_flushlsn; + // Get ZenithFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.zf); - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.diskConsistentLsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.zf.ps_flushlsn) { if (minQuorumLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = minQuorumLsn; - quorumFeedback.diskConsistentLsn = diskConsistentLsn; - /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( @@ -1831,7 +1906,7 @@ HandleSafekeeperResponse(void) //flush_lsn - This is what durably stored in WAL service. quorumFeedback.flushLsn, //apply_lsn - This is what processed and durably saved at pageserver. 
- quorumFeedback.diskConsistentLsn, + quorumFeedback.zf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2017,10 +2092,19 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->term = pq_getmsgint64_le(&s); msg->flushLsn = pq_getmsgint64_le(&s); msg->commitLsn = pq_getmsgint64_le(&s); - msg->diskConsistentLsn = pq_getmsgint64_le(&s); msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + { + StringInfoData z; + z.data = buf + APPENDRESPONSE_FIXEDPART_SIZE; + z.len = buf_size - APPENDRESPONSE_FIXEDPART_SIZE; + z.cursor = 0; + ParseZenithFeedbackMessage(&s, &msg->zf); + //advance main StringInfo cursor, because it is checked in pq_getmsgend below + s.cursor += z.cursor; + } pq_getmsgend(&s); return true; } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3da1a97c30d..8bda92bf1cf 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -239,6 +239,7 @@ void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); +static void ProcessZenithFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); @@ -1876,6 +1877,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'z': + ProcessZenithFeedbackMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -1939,6 +1944,31 @@ ProcessStandbyReplyMessage(void) applyPtr, replyTime, replyRequested); + + elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", + LSN_FORMAT_ARGS(writePtr)); + elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", + LSN_FORMAT_ARGS(flushPtr)); + elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", + LSN_FORMAT_ARGS(applyPtr)); +} + +// This message is a zenith extension of postgres replication protocol +static void +ProcessZenithFeedbackMessage(void) +{ + ZenithFeedback zf; + + // consume message length + pq_getmsgint64(&reply_message); + + ParseZenithFeedbackMessage(&reply_message, &zf); + + ProcessStandbyReply(zf.ps_writelsn, + zf.ps_flushlsn, + zf.ps_applylsn, + zf.ps_replytime, + false); } void diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 53f1a6de2fe..9a4dd028e69 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -278,6 +278,18 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; + +typedef struct ZenithFeedback +{ + // current size of the timeline on pageserver + uint64 currentInstanceSize; + // standby_status_update fields that safekeeper received from pageserver + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ZenithFeedback; + /* * Report safekeeper state to proposer */ @@ -294,11 +306,17 @@ typedef struct AppendResponse // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; - // Part of WAL applied and written to the disk by all pageservers - XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; + // Feedback recieved from pageserver includes standby_status_update fields + // and custom zenith feedback. 
+ // This part of the message is extensible. + ZenithFeedback zf; } AppendResponse; +// ZenithFeedback is extensible part of the message that is parsed separately +// Other fields are fixed part +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, zf) + /* * Descriptor of safekeeper @@ -357,6 +375,8 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, uint32 feedbackEpoch, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); +void ParseZenithFeedbackMessage(StringInfo reply_message, + ZenithFeedback *zf); void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); From d3a2327cf1b85a7acb371a44f948f3ff9ee24322 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 27 Jan 2022 17:23:54 +0300 Subject: [PATCH 095/214] Allow to join empty safekeeper to existing cluster (#123) --- src/backend/replication/walproposer.c | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a3d12f2eab0..1ae567f90c6 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1319,6 +1319,34 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = propTermHistory.entries[0].lsn; + + if (sk->startStreamingAt < truncateLsn) + { + /* + * There's a gap between the WAL starting point and a truncateLsn, + * which can't appear in a normal working cluster. That gap means + * that all safekeepers reported that they have persisted WAL up + * to the truncateLsn before, but now current safekeeper tells + * otherwise. + * + * Also we have a special condition here, which is empty safekeeper + * with no history. In combination with a gap, that can happen when + * we introduce a new safekeeper to the cluster. This is a rare case, + * which is triggered manually for now, and should be treated with + * care. + */ + + /* + * truncateLsn will not change without ack from current safekeeper, + * and it's aligned to the WAL record, so we can safely start + * streaming from this point. + */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } } else { From 22360bd0b058b178523c3fd177fd857209f8472c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 20:23:51 +0300 Subject: [PATCH 096/214] Use local relation cache for smgr_exists refer #1077 --- contrib/zenith/pagestore_smgr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 81aa2339779..d2fabd8de66 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -616,6 +616,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; + BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -642,6 +643,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -746,6 +752,9 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. 
*/ mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } } /* From 7140479da93f124272b16142fd3cecea8fdb424b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 5 Feb 2022 01:25:47 +0300 Subject: [PATCH 097/214] Reduce walproposer logging after ca5e7beaf. --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1ae567f90c6..ff281c4e2ed 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1895,7 +1895,7 @@ GetLatestZentihFeedback(ZenithFeedback *zf) { latest_safekeeper = i; replyTime = safekeeper[i].appendResponse.zf.ps_replytime; - elog(LOG, "safekeeper[%d] replyTime %lu", i, replyTime); + elog(DEBUG2, "safekeeper[%d] replyTime %lu", i, replyTime); } } From 971abf60e2667654b97c6167175a32f1a9ec99ad Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 20 Dec 2021 23:08:27 +0300 Subject: [PATCH 098/214] Implement cluster size quota for zenith compute node. Use GUC zenith.max_cluster_size to set the limit. If limit is reached, extend requests will throw out-of-space error. When current size is too close to the limit - throw a warning. Do not apply size quota to autovacuum process Add pg_cluster_size() funciton in zenith extension --- contrib/zenith/Makefile | 3 ++- contrib/zenith/libpagestore.c | 9 +++++++ contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 32 ++++++++++++++++++++++++ contrib/zenith/zenith--1.0.sql | 7 ++++++ contrib/zenith/zenith.c | 33 +++++++++++++++++++++++++ contrib/zenith/zenith_functions.c | 35 +++++++++++++++++++++++++++ src/backend/access/transam/xlog.c | 25 +++++++++++++++++++ src/backend/replication/walproposer.c | 12 ++++++--- src/backend/replication/walsender.c | 2 ++ src/include/access/xlog.h | 3 +++ src/include/replication/walproposer.h | 2 +- 12 files changed, 158 insertions(+), 6 deletions(-) create mode 100644 contrib/zenith/zenith--1.0.sql create mode 100644 contrib/zenith/zenith.c create mode 100644 contrib/zenith/zenith_functions.c diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile index 4b706186fff..a4a60d7b88c 100644 --- a/contrib/zenith/Makefile +++ b/contrib/zenith/Makefile @@ -4,12 +4,13 @@ MODULE_big = zenith OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o zenith.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) EXTENSION = zenith +DATA = zenith--1.0.sql PGFILEDESC = "zenith - cloud storage for PostgreSQL" ifdef USE_PGXS diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 2caf5d74b6e..9e16fa2c6fd 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -335,6 +335,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("zenith.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, MAX_KILOBYTES, + PGC_SIGHUP, + GUC_UNIT_BYTE, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 3643971f254..c040c4b816b 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -133,6 +133,7 @@ extern char *callmemaybe_connstring; extern char *zenith_timeline; extern char *zenith_tenant; extern bool wal_redo; +extern int32 max_cluster_size; extern 
const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); extern void smgr_init_zenith(void); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index d2fabd8de66..814e26b91fd 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -62,6 +62,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API @@ -91,6 +92,7 @@ char *callmemaybe_connstring; char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; +int32 max_cluster_size; /* unlogged relation build states */ typedef enum @@ -771,6 +773,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; + uint64 current_instance_size; switch (reln->smgr_relpersistence) { @@ -789,6 +792,35 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + current_instance_size = GetZenithCurrentClusterSize(); + + // Do not limit autovacuum processes. + if (!IsAutoVacuumWorkerProcess() && max_cluster_size > 0) + { + if (current_instance_size >= max_cluster_size) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file. Cluster size limit of %d bytes is reached", + max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + // Throw a warning if current size is too close to the limit. + // `too close' is now defined as 10% + else if (current_instance_size >= max_cluster_size*0.1) + { + ereport(WARNING, + (errmsg("Current cluster size %lu bytes is close to the limit of %d bytes. ", + current_instance_size, max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + } + else + { + ereport(WARNING, + (errmsg("Current cluster size %lu bytes is not close to the limit of %d bytes. ", + current_instance_size, max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + } + } + zenith_wallog_page(reln, forkNum, blkno, buffer); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/zenith/zenith--1.0.sql new file mode 100644 index 00000000000..095104c1045 --- /dev/null +++ b/contrib/zenith/zenith--1.0.sql @@ -0,0 +1,7 @@ +\echo Use "CREATE EXTENSION zenith" to load this file. 
\quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; \ No newline at end of file diff --git a/contrib/zenith/zenith.c b/contrib/zenith/zenith.c new file mode 100644 index 00000000000..3f2a6cee924 --- /dev/null +++ b/contrib/zenith/zenith.c @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * zenith.c + * Utility functions to expose zenith specific information to user + * + * IDENTIFICATION + * contrib/zenith/zenith.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + + +PG_FUNCTION_INFO_V1(pg_cluster_size); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} \ No newline at end of file diff --git a/contrib/zenith/zenith_functions.c b/contrib/zenith/zenith_functions.c new file mode 100644 index 00000000000..3e2b137d205 --- /dev/null +++ b/contrib/zenith/zenith_functions.c @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * zenith.c + * Utility functions to expose zenith specific information to user + * + * IDENTIFICATION + * contrib/zenith/zenith.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/clog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(pg_cluster_size); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} \ No newline at end of file diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6ae94d2ecac..18d32ff5b2c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -750,6 +750,11 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; + /* + * size of a timeline in zenith pageserver. + * used to enforce timeline size limit. 
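+	 *
+	 * Updated via SetZenithCurrentClusterSize() whenever ZenithFeedback
+	 * arrives (from the walproposer's safekeeper responses and from the
+	 * walsender's 'z' protocol message), and read via
+	 * GetZenithCurrentClusterSize() to enforce zenith.max_cluster_size in
+	 * zenith_extend() and to implement pg_cluster_size().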
+ */ + uint64 zenithCurrentClusterSize; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -8898,6 +8903,26 @@ SetLastWrittenPageLSN(XLogRecPtr lsn) } +uint64 +GetZenithCurrentClusterSize(void) +{ + uint64 size; + SpinLockAcquire(&XLogCtl->info_lck); + size = XLogCtl->zenithCurrentClusterSize; + SpinLockRelease(&XLogCtl->info_lck); + + return size; +} + + +void +SetZenithCurrentClusterSize(uint64 size) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->zenithCurrentClusterSize = size; + SpinLockRelease(&XLogCtl->info_lck); +} + /* diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index ff281c4e2ed..4e10cce8c8b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1758,9 +1758,9 @@ ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->currentInstanceSize = pq_getmsgint64(reply_message); + zf->currentClusterSize = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseZenithFeedbackMessage: current_timeline_size %lu", - zf->currentInstanceSize); + zf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { @@ -1895,11 +1895,10 @@ GetLatestZentihFeedback(ZenithFeedback *zf) { latest_safekeeper = i; replyTime = safekeeper[i].appendResponse.zf.ps_replytime; - elog(DEBUG2, "safekeeper[%d] replyTime %lu", i, replyTime); } } - zf->currentInstanceSize = safekeeper[latest_safekeeper].appendResponse.zf.currentInstanceSize; + zf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.zf.currentClusterSize; zf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_writelsn; zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; @@ -1920,6 +1919,11 @@ HandleSafekeeperResponse(void) // Get ZenithFeedback fields from the most advanced safekeeper GetLatestZentihFeedback(&quorumFeedback.zf); + if (!syncSafekeepers) + { + SetZenithCurrentClusterSize(quorumFeedback.zf.currentClusterSize); + } + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.zf.ps_flushlsn) { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 8bda92bf1cf..f60db5283b6 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1964,6 +1964,8 @@ ProcessZenithFeedbackMessage(void) ParseZenithFeedbackMessage(&reply_message, &zf); + SetZenithCurrentClusterSize(zf.currentClusterSize); + ProcessStandbyReply(zf.ps_writelsn, zf.ps_flushlsn, zf.ps_applylsn, diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 986eb957570..e34f1deaf6e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -353,6 +353,9 @@ extern void RemovePromoteSignalFiles(void); extern void SetLastWrittenPageLSN(XLogRecPtr lsn); extern XLogRecPtr GetLastWrittenPageLSN(void); +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + extern bool PromoteIsTriggered(void); extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9a4dd028e69..9bd5d8d1508 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -282,7 +282,7 @@ typedef struct HotStandbyFeedback 
typedef struct ZenithFeedback { // current size of the timeline on pageserver - uint64 currentInstanceSize; + uint64 currentClusterSize; // standby_status_update fields that safekeeper received from pageserver XLogRecPtr ps_writelsn; XLogRecPtr ps_flushlsn; From da7022bf279c4290f7090f997833bcd3d1fc515a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 9 Feb 2022 14:31:26 +0300 Subject: [PATCH 099/214] Revert "Use local relation cache for smgr_exists" This reverts commit 45dd8911ec13fd47882685f6d81d7b73696e1b84. It introduced stable test_isolation failure. There was an idea that adding strict backpressure settings would help, as absense of this commit could behave as natural backpressure, but that didn't help. No better fix is immediately available, so let's revert until sorting this out. ref https://github.com/zenithdb/zenith/issues/1238 ref https://github.com/zenithdb/zenith/pull/1239 --- contrib/zenith/pagestore_smgr.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 814e26b91fd..17f0b3b1da4 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -618,7 +618,6 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; - BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -645,11 +644,6 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) - { - return true; - } - request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -754,9 +748,6 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. */ mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { - forget_cached_relsize(rnode.node, forkNum); - } } /* From 3f671c3547f8953d4578431c08d1e921a0f63aa0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 15:26:41 +0200 Subject: [PATCH 100/214] Change the unit of cluster size limit GUC to MB, and other fixes. The GUC is a 32-bit integer, so if the base unit is bytes, the max limit you can set is only 2 GB. Furthermore, the web console assumed that the unit is in MB, and set it to 10000 meaning 10 GB, but in reality it was set to just 10 kB. Remove the WARNINGs related to cluster size limit. That was probably supposed to be DEBUG5 or something, because it's extremely noisy currently. You get the WARNING for *every block* when a relation is extended. Some kind of a WARNING when you approach the limit would make sense, but it's difficult to do in a sensible way with WARNINGs from the server. Firstly, most applications will ignore WARNINGs, in which case they don't accomplish anything. If an application forwards them to the user, that's not great either unless the application user happens to be the DBA. If you're lucky, the WARNINGs end up in an application log and the DBA is alerted, but printing the message for every relation extension is too noisy for that too. An email alert would probably be best, outside Postgres. Also don't enforce the limit when extending a temporary or unlogged relation. They don't count towards the cluster size limit, so it seems weird to error out on them. And reword the error message a bit. 
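For reference, the range arithmetic behind the unit change (illustrative only,
not part of the hunks below): a GUC value is a 32-bit integer, so with bytes as
the base unit the largest settable limit is INT_MAX bytes, roughly 2 GiB,
whereas with megabytes the same integer covers roughly 2048 TiB, provided the
value is widened to 64 bits before it is converted to bytes:

    /* widen first; a plain int multiplication by 1024*1024 would overflow */
    uint64  limit_bytes = ((uint64) max_cluster_size) * 1024 * 1024;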
Fixes https://github.com/zenithdb/zenith/issues/1233 --- contrib/zenith/libpagestore.c | 4 ++-- contrib/zenith/pagestore_smgr.c | 38 ++++++++++++--------------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 9e16fa2c6fd..e26028dc6e7 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -339,9 +339,9 @@ _PG_init(void) "cluster size limit", NULL, &max_cluster_size, - -1, -1, MAX_KILOBYTES, + -1, -1, INT_MAX, PGC_SIGHUP, - GUC_UNIT_BYTE, + GUC_UNIT_MB, NULL, NULL, NULL); relsize_hash_init(); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 17f0b3b1da4..c4dcff31fc3 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -764,7 +764,6 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; - uint64 current_instance_size; switch (reln->smgr_relpersistence) { @@ -783,33 +782,24 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - current_instance_size = GetZenithCurrentClusterSize(); - - // Do not limit autovacuum processes. - if (!IsAutoVacuumWorkerProcess() && max_cluster_size > 0) + /* + * Check that the cluster size limit has not been exceeded. + * + * Temporary and unlogged relations are not included in the cluster size measured + * by the page server, so ignore those. Autovacuum processes are also exempt. + */ + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) { - if (current_instance_size >= max_cluster_size) + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file. Cluster size limit of %d bytes is reached", - max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); - // Throw a warning if current size is too close to the limit. - // `too close' is now defined as 10% - else if (current_instance_size >= max_cluster_size*0.1) - { - ereport(WARNING, - (errmsg("Current cluster size %lu bytes is close to the limit of %d bytes. ", - current_instance_size, max_cluster_size), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), errhint("This limit is defined by zenith.max_cluster_size GUC"))); - } - else - { - ereport(WARNING, - (errmsg("Current cluster size %lu bytes is not close to the limit of %d bytes. ", - current_instance_size, max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); - } } zenith_wallog_page(reln, forkNum, blkno, buffer); From f5876691fd73d7212bbdb503944058f7c4b62e05 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 16:11:43 +0200 Subject: [PATCH 101/214] Improve error handling while connecting to page server. If anything goes wrong while establishing a connection, don't leak the socket. Also, if you get an error while sending the GetPage request, kill the connection. It's not clear what state it's in, so better to reconnect. 
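The essence of the change, with the request/response details elided (a
condensed sketch, not the actual hunk; pageserver_conn and connected are the
extension's existing globals, and the real code follows below):

    PG_TRY();
    {
        if (!connected)
            zenith_connect();   /* now closes the socket itself on failure */

        /* ... send the request, wait for and unpack the response ... */
    }
    PG_CATCH();
    {
        /*
         * If anything failed mid-request, the connection state is unknown:
         * a stale reply could later be matched to the wrong request.  Drop
         * the connection; the next call will reconnect.
         */
        if (connected)
        {
            PQfinish(pageserver_conn);
            pageserver_conn = NULL;
            connected = false;
        }
        PG_RE_THROW();
    }
    PG_END_TRY();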
--- contrib/zenith/libpagestore.c | 131 +++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 43 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index e26028dc6e7..d5e48fc89b3 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -40,7 +40,7 @@ void _PG_init(void); errhidestmt(true), errhidecontext(true))) bool connected = false; -PGconn *pageserver_conn; +PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; @@ -55,6 +55,8 @@ zenith_connect() char *query; int ret; + Assert(!connected); + pageserver_conn = PQconnectdb(page_server_connstring); if (PQstatus(pageserver_conn) == CONNECTION_BAD) @@ -62,6 +64,7 @@ zenith_connect() char *msg = pchomp(PQerrorMessage(pageserver_conn)); PQfinish(pageserver_conn); + pageserver_conn = NULL; ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg("[ZENITH_SMGR] could not establish connection"), @@ -79,6 +82,8 @@ zenith_connect() res = PQexec(pageserver_conn, query); if (PQresultStatus(res) != PGRES_COMMAND_OK) { + PQfinish(pageserver_conn); + pageserver_conn = NULL; zenith_log(ERROR, "[ZENITH_SMGR] callmemaybe command failed"); } @@ -88,8 +93,12 @@ zenith_connect() query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; zenith_log(ERROR, "[ZENITH_SMGR] failed to start dispatcher_loop on pageserver"); + } while (PQisBusy(pageserver_conn)) { @@ -109,8 +118,15 @@ zenith_connect() if (wc & WL_SOCKET_READABLE) { if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + zenith_log(ERROR, "[ZENITH_SMGR] failed to get handshake from pageserver: %s", - PQerrorMessage(pageserver_conn)); + msg); + } } } @@ -128,59 +144,88 @@ zenith_call(ZenithRequest *request) StringInfoData resp_buff; ZenithResponse *resp; - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + PG_TRY(); { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } - if (!connected) - zenith_connect(); + if (!connected) + zenith_connect(); - req_buff = zm_pack_request(request); + req_buff = zm_pack_request(request); - /* send request */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - zenith_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. 
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + zenith_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); + if (message_level_is_interesting(PqPageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); - zenith_log(PqPageStoreTrace, "Sent request: %s", msg); - pfree(msg); - } + zenith_log(PqPageStoreTrace, "Sent request: %s", msg); + pfree(msg); + } - /* read response */ - resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); - resp_buff.cursor = 0; + /* read response */ + resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.cursor = 0; - if (resp_buff.len == -1) - zenith_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + if (resp_buff.len == -1) + zenith_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - resp = zm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) resp); + if (message_level_is_interesting(PqPageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); - zenith_log(PqPageStoreTrace, "Got response: %s", msg); - pfree(msg); - } + zenith_log(PqPageStoreTrace, "Got response: %s", msg); + pfree(msg); + } - /* - * XXX: zm_to_string leak strings. Check with what memory contex all this - * methods are called. - */ + /* + * XXX: zm_to_string leak strings. Check with what memory contex all this + * methods are called. + */ + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + zenith_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); return (ZenithResponse *) resp; } From bbbe1ce8e3b52a54effccf708d64d85769bc9053 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 16:11:48 +0200 Subject: [PATCH 102/214] Make getpage requests interruptible. Fixes https://github.com/zenithdb/zenith/issues/1224 --- contrib/zenith/libpagestore.c | 41 ++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d5e48fc89b3..d3e20fc6411 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -136,6 +136,45 @@ zenith_connect() connected = true; } +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. 
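+ *
+ * A plain blocking PQgetCopyData() could leave the backend stuck and
+ * unkillable if the pageserver stops responding.  Instead, the data is
+ * requested in async mode; while no complete message is available yet we
+ * wait in WaitLatchOrSocket() and run CHECK_FOR_INTERRUPTS(), so query
+ * cancellation and backend termination keep working during a getpage
+ * request.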
+ */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + zenith_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + static ZenithResponse * zenith_call(ZenithRequest *request) @@ -183,7 +222,7 @@ zenith_call(ZenithRequest *request) } /* read response */ - resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); resp_buff.cursor = 0; if (resp_buff.len == -1) From 9c1e11dffc44e651ccf22d1df760e7eea7beaeec Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Feb 2022 11:27:35 +0200 Subject: [PATCH 103/214] Fix memory leak of messages received from safekeepers. Fixes https://github.com/zenithdb/zenith/issues/822 --- .../libpqwalproposer/libpqwalproposer.c | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index f6714c08128..085f1fcfe57 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -12,6 +12,7 @@ struct WalProposerConn { PGconn* pg_conn; bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ }; /* Prototypes for exported functions */ @@ -112,6 +113,7 @@ libpqprop_connect_start(char* conninfo) conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; return conn; } @@ -247,18 +249,36 @@ libpqprop_flush(WalProposerConn* conn) static void libpqprop_finish(WalProposerConn* conn) { + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); PQfinish(conn->pg_conn); pfree(conn); } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { int result; + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + /* Call PQconsumeInput so that we have the data we need */ if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; + } /* The docs for PQgetCopyData list the return values as: * 0 if the copy is still in progress, but no "complete row" is @@ -272,9 +292,11 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). 
*/ - switch (result = PQgetCopyData(conn->pg_conn, buf, true)) + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) { case 0: + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: { @@ -292,13 +314,18 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) /* If there was actually an error, it'll be properly reported by * calls to PQerrorMessage -- we don't have to do anything else */ + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; } case -2: + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ *amount = result; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } From 9887f03c14f58a558ecb03e8e7fe2665132fc1aa Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 11 Feb 2022 14:43:26 +0300 Subject: [PATCH 104/214] Initialize pgxactoff for walproposer refer #1244 --- src/backend/replication/walproposer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4e10cce8c8b..7c7707e2192 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -174,6 +174,7 @@ WalProposerMain(Datum main_arg) am_wal_proposer = true; am_walsender = true; InitWalSender(); + InitProcessPhase2(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) From 6bc4ba6bfcacd199da217e9bce7b313e59e98324 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 14 Feb 2022 19:53:07 +0300 Subject: [PATCH 105/214] Fix compilation warning after 8524105dc. --- src/backend/replication/walproposer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7c7707e2192..1ceae1d6dd9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -50,6 +50,7 @@ #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "storage/pmsignal.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/memutils.h" From 2f338a12ca53ac044a0ec7f25675a3883f52240d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 14 Feb 2022 22:07:18 +0300 Subject: [PATCH 106/214] Fix more compiler warnings. 
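Two idioms are used below (sketched here for reference; variable names other
than _unused are illustrative). First, write() is declared with
warn_unused_result, which gcc refuses to silence with a plain (void) cast, so
the result is assigned to a dummy variable marked pg_attribute_unused().
Second, switch statements over enum values get an initialized result and an
explicit default arm, so the compiler can see that every path assigns the
variable before use:

    /* best-effort write to stderr; result deliberately ignored */
    ssize_t _unused pg_attribute_unused();
    _unused = write(fileno(stderr), str, strlen(str));

    /* initialize up front and handle unexpected enum values explicitly */
    uint32  result = WL_NO_EVENTS;
    switch (state)
    {
        /* ... real cases ... */
        default:
            Assert(false);
            break;
    }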
--- src/backend/postmaster/seccomp.c | 4 +++- .../replication/libpqwalproposer/libpqwalproposer.c | 8 ++++++++ src/backend/replication/walproposer_utils.c | 6 +++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c index 4ff34ebbd66..03971a072cf 100644 --- a/src/backend/postmaster/seccomp.c +++ b/src/backend/postmaster/seccomp.c @@ -184,8 +184,10 @@ do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) static void die(int code, const char *str) { + /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ + ssize_t _unused pg_attribute_unused(); /* Best effort write to stderr */ - (void)write(fileno(stderr), str, strlen(str)); + _unused = write(fileno(stderr), str, strlen(str)); /* XXX: we don't want to run any atexit callbacks */ _exit(code); diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 085f1fcfe57..a12a2ee04bc 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -143,6 +143,10 @@ libpqprop_connect_poll(WalProposerConn* conn) elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* This return is never actually reached, but it's here to make the compiler happy */ return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; @@ -226,6 +230,10 @@ libpqprop_get_query_result(WalProposerConn* conn) case PGRES_PIPELINE_ABORTED: return_val = WP_EXEC_FAILED; break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 37f8d2075f6..7a593a71778 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -111,7 +111,7 @@ AssertEventsOkForState(uint32 events, Safekeeper* sk) uint32 SafekeeperStateDesiredEvents(SafekeeperState state) { - uint32 result; + uint32 result = WL_NO_EVENTS; /* If the state doesn't have a modifier, we can check the base state */ switch (state) @@ -154,6 +154,10 @@ SafekeeperStateDesiredEvents(SafekeeperState state) case SS_OFFLINE: result = WL_NO_EVENTS; break; + + default: + Assert(false); + break; } return result; From ebe340f31ce3adbfb90baa00d6176b54230215fa Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Feb 2022 12:09:50 +0200 Subject: [PATCH 107/214] Remove dead code in handling ZenithFeedback part of an AppendResponse. The constructed StringInfoData 'z' variable wasn't used for anything, we passed the original 's' StringInfo directly to ParseZenithFeedbackMessage. That's fine, but let's remove the dead code. 
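After this cleanup the receive path reads the fixed-size prefix of an
AppendResponse field by field and hands whatever remains in the buffer to the
ZenithFeedback parser; APPENDRESPONSE_FIXEDPART_SIZE (an offsetof over the
struct) marks the boundary. Condensed, the resulting logic in
AsyncReadMessage() is roughly:

    msg->term      = pq_getmsgint64_le(&s);
    msg->flushLsn  = pq_getmsgint64_le(&s);
    msg->commitLsn = pq_getmsgint64_le(&s);
    msg->hs.ts     = pq_getmsgint64_le(&s);
    msg->hs.xmin.value         = pq_getmsgint64_le(&s);
    msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);

    /* anything beyond the fixed part is the extensible ZenithFeedback section */
    if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
        ParseZenithFeedbackMessage(&s, &msg->zf);

    pq_getmsgend(&s);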
--- src/backend/replication/walproposer.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1ceae1d6dd9..cf473d29215 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1744,6 +1744,7 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } +/* Parse a ZenithFeedback message, or the ZenithFeedback part of an AppendResponse */ void ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) { @@ -2130,15 +2131,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - { - StringInfoData z; - z.data = buf + APPENDRESPONSE_FIXEDPART_SIZE; - z.len = buf_size - APPENDRESPONSE_FIXEDPART_SIZE; - z.cursor = 0; ParseZenithFeedbackMessage(&s, &msg->zf); - //advance main StringInfo cursor, because it is checked in pq_getmsgend below - s.cursor += z.cursor; - } pq_getmsgend(&s); return true; } From c2f33bada3f6044556d6b1da7c3f6b43ad0885b1 Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Fri, 18 Feb 2022 08:52:01 -0800 Subject: [PATCH 108/214] Expose reading a relation page at a specific LSN (#131) * Expose reading a relation page at a specific LSN * Addressing comments --- contrib/zenith/pagestore_client.h | 4 + contrib/zenith/pagestore_smgr.c | 69 ++++---- .../zenith_test_utils--1.0.sql | 10 ++ contrib/zenith_test_utils/zenithtest.c | 155 +++++++++++++++++- 4 files changed, 208 insertions(+), 30 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index c040c4b816b..a5dcd1efc06 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -156,6 +156,10 @@ extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + extern void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index c4dcff31fc3..7f3dc0bb09b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -917,40 +917,20 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * zenith_read() -- Read the specified block from a relation. + * While function is defined in the zenith extension it's used within zenith_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. 
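+ *
+ * zenith_test_utils' get_raw_page_at_lsn()/get_raw_page_at_lsn_ex() call this
+ * function to fetch a page image from the page server at an arbitrary LSN,
+ * bypassing the buffer cache.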
*/ -void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - request_lsn = zenith_get_request_lsn(&latest); { ZenithGetPageRequest request = { .req.tag = T_ZenithGetPageRequest, - .req.latest = latest, + .req.latest = request_latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rnode = rnode, .forknum = forkNum, .blkno = blkno }; @@ -969,9 +949,9 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -983,6 +963,37 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } pfree(resp); +} + +/* + * zenith_read() -- Read the specified block from a relation. + */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = zenith_get_request_lsn(&latest); + zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index dbf18288fd4..d595b043abf 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -12,3 +12,13 @@ RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index 2d42110cf36..bd867755e61 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -9,17 +9,34 @@ *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#include "access/relation.h" #include "access/xact.h" +#include 
"access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/varlena.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); + +/* + * This function is defined in the zenith extension, such declaration is fragile. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -117,3 +134,139 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2) + }; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + + + /* Initialize buffer to copy to */ + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + PG_RETURN_BYTEA_P(raw_page); + } +} From ed3c9a85f7d97cc353c3763b284b6f5e6888fcc0 Mon Sep 17 00:00:00 2001 From: anastasia Date: Thu, 17 Feb 2022 19:16:49 +0300 Subject: [PATCH 109/214] Add backpressure_lsns() function. 
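The walproposer runs as a background worker, and the same pageserver feedback
can also arrive through an ordinary walsender ('z' protocol messages), so the
most recent ZenithFeedback is kept in shared memory behind a spinlock where
any backend can read it; backpressure_lsns() is simply a SQL-visible view of
those three LSNs. Condensed, the accessors added below look like this (names
as in the patch):

    void
    zenith_feedback_set(ZenithFeedback *zf)
    {
        SpinLockAcquire(&zf_state->mutex);
        memcpy(&zf_state->feedback, zf, sizeof(ZenithFeedback));
        SpinLockRelease(&zf_state->mutex);
    }

    /* readers (backpressure_lag(), backpressure_lsns()) mirror this pattern */
    void
    zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn,
                             XLogRecPtr *applyLsn)
    {
        SpinLockAcquire(&zf_state->mutex);
        *writeLsn = zf_state->feedback.ps_writelsn;
        *flushLsn = zf_state->feedback.ps_flushlsn;
        *applyLsn = zf_state->feedback.ps_applylsn;
        SpinLockRelease(&zf_state->mutex);
    }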
Fix zenith feedback processing --- contrib/zenith/zenith--1.0.sql | 12 ++++- contrib/zenith/zenith.c | 37 ++++++++++++++- contrib/zenith/zenith_functions.c | 35 -------------- src/backend/replication/walproposer.c | 67 +++++++++++++++++++++++++-- src/backend/replication/walsender.c | 6 ++- src/backend/storage/ipc/ipci.c | 6 +++ src/include/replication/walproposer.h | 12 +++++ 7 files changed, 130 insertions(+), 45 deletions(-) delete mode 100644 contrib/zenith/zenith_functions.c diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/zenith/zenith--1.0.sql index 095104c1045..e414be8ceea 100644 --- a/contrib/zenith/zenith--1.0.sql +++ b/contrib/zenith/zenith--1.0.sql @@ -4,4 +4,14 @@ CREATE FUNCTION pg_cluster_size() RETURNS bigint AS 'MODULE_PATHNAME', 'pg_cluster_size' LANGUAGE C STRICT -PARALLEL UNSAFE; \ No newline at end of file +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith/zenith.c b/contrib/zenith/zenith.c index 3f2a6cee924..e88984d918c 100644 --- a/contrib/zenith/zenith.c +++ b/contrib/zenith/zenith.c @@ -15,9 +15,15 @@ #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" - +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "replication/walproposer.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -30,4 +36,31 @@ pg_cluster_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); PG_RETURN_INT64(size); -} \ No newline at end of file +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/contrib/zenith/zenith_functions.c b/contrib/zenith/zenith_functions.c deleted file mode 100644 index 3e2b137d205..00000000000 --- a/contrib/zenith/zenith_functions.c +++ /dev/null @@ -1,35 +0,0 @@ -/*------------------------------------------------------------------------- - * - * zenith.c - * Utility functions to expose zenith specific information to user - * - * IDENTIFICATION - * contrib/zenith/zenith.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" -#include "fmgr.h" - -#include "access/xact.h" -#include "access/clog.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" - - -PG_MODULE_MAGIC; - -PG_FUNCTION_INFO_V1(pg_cluster_size); - -Datum -pg_cluster_size(PG_FUNCTION_ARGS) -{ - int64 size; - - size = GetZenithCurrentClusterSize(); - - if (size == 0) - PG_RETURN_NULL(); - - PG_RETURN_INT64(size); -} \ No newline at end of file diff --git 
a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index cf473d29215..7752ae965d3 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -151,6 +151,7 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); + /* * WAL proposer bgworker entry point. */ @@ -1884,6 +1885,52 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } + +static ZenithFeedbackState *zf_state; + +/* + * ZenithFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +ZenithFeedbackShmemSize(void) +{ + return sizeof(ZenithFeedbackState); +} + +bool +ZenithFeedbackShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + zf_state = ShmemInitStruct("Zenith Feedback", + sizeof(ZenithFeedbackState), + &found); + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +zenith_feedback_set(ZenithFeedback *zf) +{ + SpinLockAcquire(&zf_state->mutex); + memcpy(&zf_state->feedback, zf, sizeof(ZenithFeedback)); + SpinLockRelease(&zf_state->mutex); +} + + +void +zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&zf_state->mutex); + *writeLsn = zf_state->feedback.ps_writelsn; + *flushLsn = zf_state->feedback.ps_flushlsn; + *applyLsn = zf_state->feedback.ps_applylsn; + SpinLockRelease(&zf_state->mutex); +} + + /* * Get ZenithFeedback fields from the most advanced safekeeper */ @@ -1891,13 +1938,13 @@ static void GetLatestZentihFeedback(ZenithFeedback *zf) { int latest_safekeeper = 0; - uint64 replyTime = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.zf.ps_replytime > replyTime) + if (safekeeper[i].appendResponse.zf.ps_writelsn > ps_writelsn) { latest_safekeeper = i; - replyTime = safekeeper[i].appendResponse.zf.ps_replytime; + ps_writelsn = safekeeper[i].appendResponse.zf.ps_writelsn; } } @@ -1906,6 +1953,16 @@ GetLatestZentihFeedback(ZenithFeedback *zf) zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; zf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.zf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + zf->currentClusterSize, + LSN_FORMAT_ARGS(zf->ps_writelsn), + LSN_FORMAT_ARGS(zf->ps_flushlsn), + LSN_FORMAT_ARGS(zf->ps_applylsn), + zf->ps_replytime); + + zenith_feedback_set(zf); } static void @@ -1919,11 +1976,11 @@ HandleSafekeeperResponse(void) minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = quorumFeedback.zf.ps_flushlsn; - // Get ZenithFeedback fields from the most advanced safekeeper - GetLatestZentihFeedback(&quorumFeedback.zf); if (!syncSafekeepers) { + // Get ZenithFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.zf); SetZenithCurrentClusterSize(quorumFeedback.zf.currentClusterSize); } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f60db5283b6..2c4bcae0ab7 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1964,6 +1964,8 @@ ProcessZenithFeedbackMessage(void) 
ParseZenithFeedbackMessage(&reply_message, &zf); + zenith_feedback_set(&zf); + SetZenithCurrentClusterSize(zf.currentClusterSize); ProcessStandbyReply(zf.ps_writelsn, @@ -3854,10 +3856,10 @@ backpressure_lag(void) XLogRecPtr applyPtr; XLogRecPtr myFlushLsn = GetFlushRecPtr(); - GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); + zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) - elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X ZenithFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3e4ec53a97e..5fb07a87eb8 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -34,6 +34,7 @@ #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" @@ -150,6 +151,9 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + + size = add_size(size, ZenithFeedbackShmemSize()); + #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -270,6 +274,8 @@ CreateSharedMemoryAndSemaphores(void) SyncScanShmemInit(); AsyncShmemInit(); + ZenithFeedbackShmemInit(); + #ifdef EXEC_BACKEND /* diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9bd5d8d1508..159af4f4bdc 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -290,6 +290,14 @@ typedef struct ZenithFeedback TimestampTz ps_replytime; } ZenithFeedback; + +typedef struct ZenithFeedbackState +{ + slock_t mutex; + ZenithFeedback feedback; + +} ZenithFeedbackState; + /* * Report safekeeper state to proposer */ @@ -380,6 +388,10 @@ void ParseZenithFeedbackMessage(StringInfo reply_message, void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); +Size ZenithFeedbackShmemSize(void); +bool ZenithFeedbackShmemInit(void); +void zenith_feedback_set(ZenithFeedback *zf); +void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ From cc281ef4d226e5965af40bdc01e1066118b0125d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 23 Feb 2022 18:50:31 +0300 Subject: [PATCH 110/214] Fix zenith_test_utils linkage on macOS Use function pointer to perform a cross-extension calls. --- contrib/zenith_test_utils/Makefile | 3 +++ contrib/zenith_test_utils/zenithtest.c | 25 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile index 9203f2349d3..5b2fcdc18fe 100644 --- a/contrib/zenith_test_utils/Makefile +++ b/contrib/zenith_test_utils/Makefile @@ -10,11 +10,14 @@ EXTENSION = zenith_test_utils DATA = zenith_test_utils--1.0.sql PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" +EXTRA_INSTALL=contrib/zenith + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else +PG_CPPFLAGS = -I$(top_srcdir)/contrib subdir = contrib/zenith_test_utils top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index bd867755e61..c1e2c1c92f4 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -22,22 +22,41 @@ #include "utils/builtins.h" #include "utils/rel.h" #include "utils/varlena.h" - +#include "zenith/pagestore_client.h" PG_MODULE_MAGIC; +extern void _PG_init(void); + PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); /* - * This function is defined in the zenith extension, such declaration is fragile. + * Linkage to functions in zenith module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. + */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); + zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) + load_external_function("$libdir/zenith", "zenith_read_at_lsn", + true, NULL); +} + +#define zenith_read_at_lsn zenith_read_at_lsn_ptr + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ From 6e96c376adfeb552e7ed8bba95eab9fba606fb39 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 21 Feb 2022 15:07:26 +0300 Subject: [PATCH 111/214] Add warning fr unrecgonized GUCs with zenith prefix refer #1262 --- contrib/zenith/libpagestore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d3e20fc6411..d8ec3eba81d 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -429,6 +429,7 @@ _PG_init(void) NULL, NULL, NULL); relsize_hash_init(); + EmitWarningsOnPlaceholders("zenith"); if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); From 49a022a1d6ffd0a7ef356b737a9a7ad0e2e888f8 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 8 Mar 2022 16:01:03 +0300 Subject: [PATCH 112/214] Count WAL flushes in walreceiver (#139) --- src/backend/replication/walreceiver.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 4831a259c48..9b3f01207f8 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -494,6 +494,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. 
+ */ + pgstat_send_wal(true); + /* * Ideally we would reuse a WaitEventSet object repeatedly * here to avoid the overheads of WaitLatchOrSocket on epoll From 1a84f71ef5760b85b216fdcbf95b65f1c33a49a5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 20:23:51 +0300 Subject: [PATCH 113/214] Use local relation cache for smgr_exists refer #1077 --- contrib/zenith/pagestore_smgr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 7f3dc0bb09b..5ab935a4e0b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -618,6 +618,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; + BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -644,6 +645,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -748,6 +754,9 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. */ mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } } /* From c9b6c96bb2a5188c43b54a4723bb533577de845e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 16:05:05 +0200 Subject: [PATCH 114/214] Populate relsize cache when relation is created. Postgres can perform an smgrnblocks() call on the relation right after creating it, and we don't update the last-written LSN on smgrcreate(). Perhaps we should update last-written LSN, instead. This isn't bulletproof. --- contrib/zenith/pagestore_smgr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 5ab935a4e0b..afda2bd4767 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -720,6 +720,18 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) reln->smgr_rnode.node.relNode, forkNum); + /* + * Newly created relation is empty, remember that in the relsize cache. + * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record hass been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdcreate(reln, forkNum, isRedo); From 4d1b0cb4b0440d0b3b9ffd4eb1b3023acdb2260a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 28 Mar 2022 09:38:46 +0300 Subject: [PATCH 115/214] Fix pg_table_size() on a view --- contrib/zenith/pagestore_smgr.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index afda2bd4767..18c55fa5cdc 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -650,6 +650,23 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return true; } + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. 
The page server + * will error out if you check that, because the whole dbdir for tablespace + * 0, db 0 doesn't exists. We possibly should change the page server to + * accept that and return 'false', to be consistent with mdexists(). But + * we probably also should fix pg_table_size() to not call smgrexists() + * with bogus relfilenode. + * + * For now, handle that special case here. + */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { From abb201e4bedf95a45581f096ed95c5e01dff0ecb Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 28 Mar 2022 18:10:05 +0400 Subject: [PATCH 116/214] Don't set commitLsn to truncateLsn. It might jump back (on compute) this way, which is not fatal but violates sanity checks. --- src/backend/replication/walproposer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7752ae965d3..dfc4a538918 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -914,8 +914,6 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; - sk->appendResponse.flushLsn = truncateLsn; - sk->appendResponse.hs.ts = 0; ++n_connected; if (n_connected <= quorum) From 135880545c913f198c5f4cc90002c999c189118b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 31 Mar 2022 18:46:43 +0300 Subject: [PATCH 117/214] Raise fatal error on failed recovery (#147) --- src/backend/replication/walproposer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index dfc4a538918..4d5092e94eb 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1257,10 +1257,14 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec break; } } - ereport(DEBUG1, + ereport(LOG, (errmsg("end of replication stream at %X/%X: %m", LSN_FORMAT_ARGS(rec_end_lsn)))); walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; } else { From afbc2a0d5e3dd3a3c96d317a0919bcb7fc15ce68 Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Fri, 1 Apr 2022 12:44:28 -0700 Subject: [PATCH 118/214] Enable dumping corrupt WAL segments (#145) * Enable dumping corrupt WAL segments Add ability to dump WAL segment with corrupt page headers and recrods skips over missing/broken page headers skips over misformatted log recrods allows dumping log record from a particular file starting from an optional offset (without a need of carefully crafted input) --- src/backend/access/transam/xlogreader.c | 102 ++++++++++--- src/bin/pg_waldump/pg_waldump.c | 194 ++++++++++++++++++++++-- src/include/access/xlogreader.h | 5 + 3 files changed, 265 insertions(+), 36 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index f84569cdbab..988be3fff23 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -208,7 +208,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || 
state->skip_lsn_checks); ResetDecoder(state); @@ -248,6 +248,14 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) bool gotheader; int readOff; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * randAccess indicates whether to verify the previous-record pointer of * the record we're reading. We only do this if we're reading @@ -284,7 +292,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * In this case, EndRecPtr should already be pointing to a valid * record starting position. */ - Assert(XRecOffIsValid(RecPtr)); + Assert(XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -320,17 +328,23 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -345,6 +359,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * cannot access any other fields until we've verified that we got the * whole header. */ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -360,7 +375,13 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) { if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -368,11 +389,17 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* There may be no next page if it's too small. */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* We'll validate the header once we have the next page. 
*/ gotheader = false; @@ -438,10 +465,15 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "there is no contrecord flag at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -451,12 +483,17 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "invalid contrecord length %u (expected %lld) at %X/%X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -487,7 +524,13 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } @@ -518,7 +561,12 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->ReadRecPtr = RecPtr; @@ -535,7 +583,12 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->EndRecPtr = RecPtr + MAXALIGN(total_len); @@ -639,8 +692,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* we can be sure to have enough WAL available, we scrolled back */ Assert(readLen == XLOG_BLCKSZ); - if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf) && !state->skip_page_validation) goto err; } @@ -677,7 +729,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && !state->skip_page_validation) goto err; /* update read state information */ @@ -735,7 +787,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -751,7 +803,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. 
*/ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -896,7 +948,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. */ - if (hdr->xlp_pageaddr != recaddr) + if (hdr->xlp_pageaddr != recaddr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 2daed328e7d..bb4062e17ea 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -13,9 +13,11 @@ #include "postgres.h" #include +#include #include #include + #include "access/transam.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -23,8 +25,11 @@ #include "common/fe_memutils.h" #include "common/logging.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" +#define OFFSET_INVALID ((size_t)-1) + static const char *progname; static int WalSegSz; @@ -35,6 +40,7 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; typedef struct XLogDumpConfig @@ -52,6 +58,7 @@ typedef struct XLogDumpConfig int filter_by_rmgr; TransactionId filter_by_xid; bool filter_by_xid_enabled; + bool ignore_format_errors; } XLogDumpConfig; typedef struct Stats @@ -70,8 +77,36 @@ typedef struct XLogDumpStats Stats record_stats[RM_NEXT_ID][MAX_XLINFO_TYPES]; } XLogDumpStats; + #define fatal_error(...) do { pg_log_fatal(__VA_ARGS__); exit(EXIT_FAILURE); } while(0) +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + static void print_rmgr_list(void) { @@ -284,6 +319,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + fatal_error("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -354,6 +401,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? 
private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -362,11 +410,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; fatal_error("could not read from file %s, offset %u: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else fatal_error("could not read from file %s, offset %u: read %d of %zu", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, (Size) errinfo.wre_req); } @@ -465,16 +513,25 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) int block_id; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = record->private_data; StringInfoData s; XLogDumpRecordLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", desc->rm_name, rec_len, XLogRecGetTotalLen(record), XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), + record->ReadRecPtr, LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); id = desc->rm_identify(info); if (id == NULL) @@ -726,7 +783,10 @@ usage(void) printf(_(" -b, --bkp-details output detailed information about backup blocks\n")); printf(_(" -e, --end=RECPTR stop reading at WAL location RECPTR\n")); printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); + printf(_(" -F, --file=FNAME dump log records from a single file\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find log segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -757,14 +817,20 @@ main(int argc, char **argv) XLogRecord *record; XLogRecPtr first_record; char *waldir = NULL; + char *fname = NULL; char *errormsg; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, + {"file", required_argument, NULL, 'F'}, {"help", no_argument, NULL, '?'}, + {"ignore", no_argument, NULL, 'i'}, {"limit", required_argument, NULL, 'n'}, + {"offset", required_argument, NULL, 'o'}, {"path", required_argument, NULL, 'p'}, {"quiet", no_argument, NULL, 'q'}, {"rmgr", required_argument, NULL, 'r'}, @@ -805,6 +871,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -816,6 +883,7 @@ main(int argc, char **argv) config.filter_by_xid_enabled = false; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; if (argc <= 1) { @@ -823,7 +891,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "be:fn:p:qr:s:t:x:z", + while ((option = 
getopt_long(argc, argv, "be:fF:in:o:p:qr:s:t:x:z", long_options, &optindex)) != -1) { switch (option) @@ -843,6 +911,13 @@ main(int argc, char **argv) case 'f': config.follow = true; break; + case 'F': + fname = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -850,6 +925,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': waldir = pg_strdup(optarg); break; @@ -936,6 +1018,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option could only be used with filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat stat; + + if(config.follow) + { + pg_log_error("Follow could not be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either single file or wal directory should be specified"); + goto bad_argument; + } + + split_path(fname, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + fatal_error("could identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + fatal_error("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + fatal_error("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + fatal_error("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -954,6 +1103,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if(single_file) + { + pg_log_error("either single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1026,10 +1181,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1047,12 +1203,28 @@ main(int argc, char **argv) if (!xlogreader_state) fatal_error("out of memory"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } + + xlogreader_state->skip_lsn_checks = true; - if (first_record == InvalidXLogRecPtr) - 
fatal_error("could not find a valid record after %X/%X", + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + + if (first_record == InvalidXLogRecPtr) + fatal_error("could not find a valid record after %X/%X", LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 10458c23eda..c7fac7bdace 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -262,6 +262,11 @@ struct XLogReaderState XLogRecPtr missingContrecPtr; /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + + /* Disable validation to allow dumpng corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; }; /* Get a new XLogReader */ From 87dae2a4d6a6df7913d3f9eb69c2c9a80b5713f4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 4 Apr 2022 15:52:29 +0300 Subject: [PATCH 119/214] Don't hold walproposer WAL in memory (#141) WAL is no longer in memory to prevent OOM in the compute. Removed in-memory queue because it's not needed anymore. When streaming, WAL is now read directly from disk. Every safekeeper has a separate XLogReader. walproposer will now read as much WAL as it can for a single AppendRequest message, it can help with recovering lagging safekeepers. Because Recovery needs to save WAL for streaming, now walproposer can write WAL to disk and `--sync-safekeepers` mode will create pg_wal directory if needed. Replication slot `restart_lsn` is now synced with `truncate_lsn` to prevent truncation of disk WAL until needed. 
--- src/backend/replication/walproposer.c | 318 ++++++++------------ src/backend/replication/walproposer_utils.c | 106 +++++++ src/backend/replication/walsender.c | 123 ++++---- src/include/replication/walproposer.h | 52 ++-- 4 files changed, 323 insertions(+), 276 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4d5092e94eb..4843b10e1d9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -37,7 +37,9 @@ #include #include +#include #include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "replication/walproposer.h" #include "storage/latch.h" #include "miscadmin.h" @@ -73,12 +75,8 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static WalMessage *msgQueueHead; -static WalMessage *msgQueueTail; -static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to - * this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to - * safekeepers */ +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; @@ -134,10 +132,8 @@ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr start static void SendProposerElected(Safekeeper *sk); static void WalProposerStartStreaming(XLogRecPtr startpos); static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(Safekeeper *sk, WalMessage *msg); -static void BroadcastMessage(WalMessage *msg); -static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); -static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void SendMessageToNode(Safekeeper *sk); +static void BroadcastAppendRequest(void); static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); @@ -198,7 +194,10 @@ WalProposerMain(Datum main_arg) void WalProposerSync(int argc, char *argv[]) { + struct stat stat_buf; + syncSafekeepers = true; + ThisTimeLineID = 1; InitStandaloneProcess(argv[0]); @@ -233,6 +232,22 @@ WalProposerSync(int argc, char *argv[]) (errcode_for_socket_access(), errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + WalProposerInit(0, 0); process_shared_preload_libraries_in_progress = false; @@ -247,12 +262,11 @@ WalProposerSync(int argc, char *argv[]) * called from walsender every time the new WAL is available. 
*/ void -WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) +WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) { - WalMessage *msg = CreateMessage(startpos, data, len); - - if (msg != NULL) - BroadcastMessage(msg); + Assert(startpos == availableLsn && endpos >= availableLsn); + availableLsn = endpos; + BroadcastAppendRequest(); } /* @@ -303,9 +317,9 @@ WalProposerPoll(void) * If no WAL was generated during timeout (and we have already * collected the quorum), then send pool message */ - if (lastSentLsn != InvalidXLogRecPtr) + if (availableLsn != InvalidXLogRecPtr) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + BroadcastAppendRequest(); } } } @@ -379,9 +393,12 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ safekeeper[n_safekeepers].conninfo[0] = '\0'; initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + if (safekeeper[n_safekeepers].xlogreader == NULL) + elog(FATAL, "Failed to allocate xlog reader"); safekeeper[n_safekeepers].flushWrite = false; - safekeeper[n_safekeepers].currMsg = NULL; safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; n_safekeepers += 1; } if (n_safekeepers < 1) @@ -513,7 +530,7 @@ ShutdownConnection(Safekeeper *sk) sk->conn = NULL; sk->state = SS_OFFLINE; sk->flushWrite = false; - sk->currMsg = NULL; + sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1087,13 +1104,13 @@ HandleElectedProposer(void) if (syncSafekeepers) { /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + * Send empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastAppendRequest(); /* keep polling until all safekeepers are synced */ return; @@ -1172,6 +1189,12 @@ DetermineEpochStartLsn(void) Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. + */ + availableLsn = propEpochStartLsn; + /* * Proposer's term history is the donor's + its own entry. 
*/ @@ -1249,7 +1272,10 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void) CreateMessage(rec_start_lsn, buf, len); + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + ereport(DEBUG1, (errmsg("Recover message %X/%X length %d", LSN_FORMAT_ARGS(rec_start_lsn), len))); @@ -1374,7 +1400,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(msgQueueHead == NULL || sk->startStreamingAt >= msgQueueHead->req.beginLsn); + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); msg.tag = 'e'; msg.term = propTerm; @@ -1426,39 +1452,29 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - WalMessage *startMsg = msgQueueHead; - /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. */ sk->state = SS_ACTIVE; - - while (startMsg != NULL && startMsg->req.endLsn <= sk->startStreamingAt) - startMsg = startMsg->next; - - /* We should always have WAL to start from sk->startStreamingAt */ - Assert(startMsg == NULL || startMsg->req.beginLsn <= sk->startStreamingAt); + sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(sk, startMsg); + SendMessageToNode(sk); } /* - * Start sending message to the particular node. Always updates event set. + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. * * Can be used only for safekeepers in SS_ACTIVE state. State can be changed * in case of errors. */ static void -SendMessageToNode(Safekeeper *sk, WalMessage *msg) +SendMessageToNode(Safekeeper *sk) { - /* we shouldn't be already sending something */ - Assert(sk->currMsg == NULL); Assert(sk->state == SS_ACTIVE); - sk->currMsg = msg; - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } @@ -1467,95 +1483,25 @@ SendMessageToNode(Safekeeper *sk, WalMessage *msg) * Broadcast new message to all caught-up safekeepers */ static void -BroadcastMessage(WalMessage *msg) +BroadcastAppendRequest() { for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) - { - SendMessageToNode(&safekeeper[i], msg); - } - } -} - -static WalMessage * -CreateMessage(XLogRecPtr startpos, char *data, int len) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; - XLogRecPtr endpos; - - len -= XLOG_HDR_SIZE; - endpos = startpos + len; - if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) - { - /* Message already queued */ - return NULL; - } - Assert(len >= 0); - msg = (WalMessage *) malloc(sizeof(WalMessage) + len); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; - - msg->size = sizeof(AppendRequestHeader) + len; - msg->next = NULL; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = startpos; - msg->req.endLsn = endpos; - msg->req.proposerId = greetRequest.proposerId; - memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); - - Assert(msg->req.endLsn >= lastSentLsn); - lastSentLsn = msg->req.endLsn; - return msg; + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); } -/* - * 
Create WAL message with no data, just to let the safekeepers - * know that commit lsn has advanced. - */ -static WalMessage * -CreateMessageCommitLsnOnly(XLogRecPtr lsn) +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { - /* Create new message and append it to message queue */ - WalMessage *msg; - - msg = (WalMessage *) malloc(sizeof(WalMessage)); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; - - msg->size = sizeof(AppendRequestHeader); - msg->next = NULL; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - - /* - * This serves two purposes: 1) After all msgs from previous epochs are - * pushed we queue empty WalMessage with lsn set to epochStartLsn which - * commands to switch the epoch, which allows to do the switch without - * creating new epoch records (we especially want to avoid such in --sync - * mode). Walproposer can advance commit_lsn only after the switch, so - * this lsn (reported back) also is the first possible advancement point. - * 2) Maintain common invariant of queue entries sorted by LSN. - */ - msg->req.beginLsn = lsn; - msg->req.endLsn = lsn; - msg->req.proposerId = greetRequest.proposerId; - - /* - * truncateLsn and commitLsn are set just before the message sent, in - * SendAppendRequests() - */ - return msg; + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; } /* @@ -1578,20 +1524,22 @@ HandleActiveState(Safekeeper *sk, uint32 events) * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * sk->currMsg checks if we have pending unsent messages. This check isn't - * necessary now, because we always send queue messages immediately after - * creation. But it's good to have it here in case we change this behavior + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior * in the future. */ - if (sk->currMsg != NULL || sk->flushWrite) + if (sk->streamingAt != availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; UpdateEventSet(sk, newEvents); } /* - * Send queue messages starting from sk->currMsg until the end or non-writable + * Send WAL messages starting from sk->streamingAt until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. 
@@ -1599,9 +1547,11 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - WalMessage *msg; + XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; if (sk->flushWrite) { @@ -1616,37 +1566,21 @@ SendAppendRequests(Safekeeper *sk) sk->flushWrite = false; } - while (sk->currMsg) + while (sk->streamingAt != availableLsn || !sentAnything) { - msg = sk->currMsg; - req = &msg->req; + sentAnything = true; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. - */ - if (sk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; - - Assert(sk->startStreamingAt < req->endLsn); - - len = msg->req.endLsn - sk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = sk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + sk->startStreamingAt - - msg->req.beginLsn, - len); + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; } + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + ereport(DEBUG2, (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", req->endLsn - req->beginLsn, @@ -1655,19 +1589,28 @@ SendAppendRequests(Safekeeper *sk) LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - writeResult = walprop_async_write(sk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); - - /* Free up resources */ - if (req != &msg->req) - free(req); + resetStringInfo(&sk->outbuf); + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + /* Mark current message as sent, whatever the result is */ - sk->currMsg = sk->currMsg->next; + sk->streamingAt = endLsn; switch (writeResult) { @@ -1723,6 +1666,13 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + readAnything = true; } @@ -1733,14 +1683,11 @@ RecvAppendResponses(Safekeeper *sk) /* * Also send the new commit lsn to all the safekeepers. - * - * FIXME: This is redundant for safekeepers that have other - * outbound messages pending. 
*/ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastSentCommitLsn) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + BroadcastAppendRequest(); lastSentCommitLsn = minQuorumLsn; } @@ -2033,25 +1980,16 @@ HandleSafekeeperResponse(void) */ minFlushLsn = CalculateMinFlushLsn(); if (minFlushLsn > truncateLsn) - truncateLsn = minFlushLsn; - - /* - * Cleanup message queue up to truncateLsn. These messages were processed - * by all safekeepers because they all reported flushLsn greater than endLsn. - */ - while (msgQueueHead != NULL && msgQueueHead->req.endLsn < truncateLsn) { - WalMessage *msg = msgQueueHead; - msgQueueHead = msg->next; + truncateLsn = minFlushLsn; - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); - free(msg); + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); } - if (!msgQueueHead) /* queue is empty */ - msgQueueTail = NULL; - - /* truncateLsn always points to the first chunk in the queue */ - Assert(msgQueueHead == NULL || (truncateLsn >= msgQueueHead->req.beginLsn && truncateLsn <= msgQueueHead->req.endLsn)); /* * Generally sync is done when majority switched the epoch so we committed diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 7a593a71778..c9ddafdee0c 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -8,6 +8,15 @@ #include #include +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + int CompareLsn(const void *a, const void *b) { @@ -294,3 +303,100 @@ pq_sendint64_le(StringInfo buf, uint64 i) memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); } + +/* + * Write XLOG data to disk. 
+ */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + bool use_existent = true; + + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. + */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 2c4bcae0ab7..a3dc11c41bc 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1892,7 +1892,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. */ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2058,6 +2058,13 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); + /* + * walproposer use trunclateLsn instead of flushPtr for confirmed + * received location, so we shouldn't update restart_lsn here. + */ + if (am_wal_proposer) + return; + /* * Advance our local xmin horizon when the client confirmed a flush. */ @@ -2858,73 +2865,73 @@ XLogSendPhysical(void) nbytes = endptr - startptr; Assert(nbytes <= MAX_SEND_SIZE); - /* - * OK to read and send the slice. 
- */ - if (output_message.data) - resetStringInfo(&output_message); + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } else - initStringInfo(&output_message); - - pq_sendbyte(&output_message, 'w'); - pq_sendint64(&output_message, startptr); /* dataStart */ - pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ - pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - - /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ - enlargeStringInfo(&output_message, nbytes); + { + /* + * OK to read and send the slice. + */ + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); -retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); + pq_sendbyte(&output_message, 'w'); + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - /* See logical_read_xlog_page(). */ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ + enlargeStringInfo(&output_message, nbytes); + + retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); + + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ - if (am_cascading_walsender) - { - WalSnd *walsnd = MyWalSnd; - bool reload; + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. + */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; - SpinLockAcquire(&walsnd->mutex); - reload = walsnd->needreload; - walsnd->needreload = false; - SpinLockRelease(&walsnd->mutex); + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) - { - wal_segment_close(xlogreader); + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); - goto retry; + goto retry; + } } - } - output_message.len += nbytes; - output_message.data[output_message.len] = '\0'; + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, output_message.data, output_message.len); - } - else - { /* * Fill the send timestamp last, so that it is taken as late as possible. 
*/ diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 159af4f4bdc..538dcf6c5b6 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -14,6 +14,7 @@ #define SK_PROTOCOL_VERSION 1 #define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ #define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ @@ -251,23 +252,6 @@ typedef struct AppendRequestHeader pg_uuid_t proposerId; /* for monitoring/debugging */ } AppendRequestHeader; -/* - * All copy data message ('w') are linked in L1 send list and asynchronously sent to receivers. - * When message is sent to all receivers, it is removed from send list. - */ -struct WalMessage -{ - WalMessage* next; /* L1 list of messages */ - uint32 size; /* message size */ - AppendRequestHeader req; /* request to safekeeper (message header) */ - - /* PHANTOM FIELD: - * - * All WalMessages are allocated with exactly (size - sizeof(AppendRequestHeader)) additional bytes - * after them, containing the body of the message. This allocation is done in `CreateMessage` - * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ -}; - /* * Hot standby feedback received from replica */ @@ -342,20 +326,29 @@ typedef struct Safekeeper * reach SS_ACTIVE; not before. */ WalProposerConn* conn; + /* + * Temporary buffer for the message being sent to the safekeeper. + */ StringInfoData outbuf; + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState* xlogreader; - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ - - int eventPos; /* position in wait event set. Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback to master */ /* * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. 
Equal to -1 if no event */ + SafekeeperState state; /* safekeeper state machine state */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ } Safekeeper; @@ -365,19 +358,22 @@ void AssertEventsOkForState(uint32 events, Safekeeper* sk); uint32 SafekeeperStateDesiredEvents(SafekeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); void WalProposerPoll(void); void WalProposerRegister(void); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, XLogRecPtr applyPtr, TimestampTz replyTime, bool replyRequested); +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); void ProcessStandbyHSFeedback(TimestampTz replyTime, TransactionId feedbackXmin, uint32 feedbackEpoch, From 5bf4e687155c33a798db6f6c515850a3555c99dd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 27 Jan 2022 16:51:17 +0300 Subject: [PATCH 120/214] Add --sysid parameter to initdb --- src/backend/access/transam/xlog.c | 15 +++++++++++---- src/backend/bootstrap/bootstrap.c | 13 ++++++++++++- src/bin/initdb/initdb.c | 4 ++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 18d32ff5b2c..73dd459a318 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -112,6 +112,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -5323,10 +5324,16 @@ BootStrapXLOG(void) * perhaps be useful sometimes. 
*/ gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; - + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* First timeline ID is always 1 */ ThisTimeLineID = 1; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 94ab5ca0954..97546f34e9a 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -53,6 +53,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void BootstrapModeMain(void); @@ -225,7 +226,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* If no -x argument, we are a CheckerProcess */ MyAuxProcType = CheckerProcess; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:x:X:-:")) != -1) { switch (flag) { @@ -272,6 +273,16 @@ AuxiliaryProcessMain(int argc, char *argv[]) PGC_S_OVERRIDE); } break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 77e621a7679..6e09e22062c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2948,6 +2948,7 @@ main(int argc, char *argv[]) {"data-checksums", no_argument, NULL, 'k'}, {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, + {"sysid", required_argument, NULL, 15}, {NULL, 0, NULL, 0} }; @@ -3094,6 +3095,9 @@ main(int argc, char *argv[]) extra_options, "-c debug_discard_caches=1"); break; + case 15: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), From 88cfd4ae646eba48fc6514bb0243e9bb3a490ce7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Apr 2022 18:11:05 +0400 Subject: [PATCH 121/214] Give up connection attempt to safekeeper after timeout. Enforces reconnection soon when packets are dropped, e.g. after turning ec2 instance off. ref https://github.com/neondatabase/neon/issues/1491 --- src/backend/replication/walproposer.c | 27 +++++++++++++++++++++++++-- src/backend/utils/misc/guc.c | 11 +++++++++++ src/include/replication/walproposer.h | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4843b10e1d9..d9d44201242 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -61,6 +61,7 @@ char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; +int wal_acceptor_connect_timeout; bool am_wal_proposer; char *zenith_timeline_walproposer = NULL; @@ -313,6 +314,8 @@ WalProposerPoll(void) } if (rc == 0) /* timeout expired: poll state */ { + TimestampTz now; + /* * If no WAL was generated during timeout (and we have already * collected the quorum), then send pool message @@ -321,6 +324,25 @@ WalProposerPoll(void) { BroadcastAppendRequest(); } + + /* + * Abandon connection attempts which take too long. 
+ */ + now = GetCurrentTimestamp(); + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + + if ((sk->state == SS_CONNECTING_WRITE || + sk->state == SS_CONNECTING_READ) && + TimestampDifferenceExceeds(sk->startedConnAt, now, + wal_acceptor_connect_timeout)) + { + elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", + sk->host, sk->port, wal_acceptor_connect_timeout); + ShutdownConnection(sk); + } + } } } } @@ -622,9 +644,10 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - elog(LOG, "Connecting with node %s:%s", sk->host, sk->port); + elog(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; + sk->startedConnAt = GetCurrentTimestamp(); sock = walprop_socket(sk->conn); sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); @@ -803,7 +826,7 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", + elog(WARNING, "failed to connect to node '%s:%s': %s", sk->host, sk->port, walprop_error_message(sk->conn)); /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index db0bf7ed967..7d449ba9fa2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2321,6 +2321,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_acceptor_connect_timeout", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout after which give up connection attempt to safekeeper."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 538dcf6c5b6..19361eeaffc 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -27,6 +27,7 @@ extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; extern bool am_wal_proposer; struct WalProposerConn; /* Defined in libpqwalproposer */ @@ -346,6 +347,7 @@ typedef struct Safekeeper int eventPos; /* position in wait event set. Equal to -1 if no event */ SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse appendResponse; /* feedback for master */ From 5c8fa2cd2ae0fdbd26958877c7f5aca74b05476c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 23 Apr 2022 08:52:15 +0300 Subject: [PATCH 122/214] Avoid redundand memory allocation and sycnhronization in walredo (#144) * Avoid redundand memory allocation and sycnhronization in walredo * Address review comments * Reduce number of temp buffers and size of inmem file storage for wal redo postgres * Misc cleanup Add comments on 'inmem_smgr.c', remove superfluous copy-pasted comments, pgindent. 
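In short, the rewritten inmem_smgr.c drops the hash table and keeps written pages in a small fixed pool of tag/body arrays, found by linear search. A condensed sketch of the lookup, using the names introduced in the hunks below (the real code additionally handles slot allocation and the MAX_PAGES overflow ERROR in inmem_write()):

    static BufferTag page_tag[MAX_PAGES];
    static char      page_body[MAX_PAGES][BLCKSZ];
    static int       used_pages;

    static int
    locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
    {
        /* only a handful of pages are ever stored, so linear search is enough */
        for (int i = 0; i < used_pages; i++)
        {
            if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) &&
                forknum == page_tag[i].forkNum &&
                blkno == page_tag[i].blockNum)
                return i;
        }
        return -1;          /* inmem_read() then hands back an all-zero page */
    }

Writes claim the next free slot and raise an ERROR once all MAX_PAGES slots are in use, which is the limit later patches in this series keep running into.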
Co-authored-by: Heikki Linnakangas --- contrib/zenith/inmem_smgr.c | 188 +++++++++++----------------- src/backend/storage/buffer/bufmgr.c | 7 +- src/backend/tcop/zenith_wal_redo.c | 78 +++++------- src/include/miscadmin.h | 3 + 4 files changed, 112 insertions(+), 164 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index 6ad1e65b04a..bdd58731f3c 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -2,36 +2,52 @@ * * inmem_smgr.c * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * contrib/zenith/inmem_smgr.c - * - * TODO cleanup obsolete copy-pasted comments *------------------------------------------------------------------------- */ #include "postgres.h" + +#include "pagestore_client.h" #include "storage/block.h" +#include "storage/buf_internals.h" #include "storage/relfilenode.h" -#include "pagestore_client.h" -#include "utils/hsearch.h" -#include "access/xlog.h" +#include "storage/smgr.h" -typedef struct -{ - RelFileNode node; - ForkNumber forknum; - BlockNumber blkno; -} WrNodeKey; +#define MAX_PAGES 128 -typedef struct -{ - WrNodeKey tag; - char data[BLCKSZ]; -} WrNode; +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; -HTAB *inmem_files; +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} /* * inmem_init() -- Initialize private state @@ -39,18 +55,7 @@ HTAB *inmem_files; void inmem_init(void) { - HASHCTL hashCtl; - - hashCtl.keysize = sizeof(WrNodeKey); - hashCtl.entrysize = sizeof(WrNode); - - if (inmem_files) - hash_destroy(inmem_files); - - inmem_files = hash_create("wal-redo files map", - 1024, - &hashCtl, - HASH_ELEM | HASH_BLOBS); + used_pages = 0; } /* @@ -59,15 +64,15 @@ inmem_init(void) bool inmem_exists(SMgrRelation reln, ForkNumber forknum) { - WrNodeKey key; - - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = 0; - return hash_search(inmem_files, - &key, - HASH_FIND, - NULL) != NULL; + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; } /* @@ -82,21 +87,6 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) /* * inmem_unlink() -- Unlink a relation. - * - * Note that we're passed a RelFileNodeBackend --- by the time this is called, - * there won't be an SMgrRelation hashtable entry anymore. - * - * forknum can be a fork number to delete a specific fork, or InvalidForkNumber - * to delete all forks. - * - * - * If isRedo is true, it's unsurprising for the relation to be already gone. 
- * Also, we should remove the file immediately instead of queuing a request - * for later, since during redo there's no possibility of creating a - * conflicting relation. - * - * Note: any failure should be reported as WARNING not ERROR, because - * we are usually not in a transaction anymore when this is called. */ void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) @@ -116,17 +106,8 @@ void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer, bool skipFsync) { - WrNodeKey key; - WrNode *node; - - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blkno; - node = hash_search(inmem_files, - &key, - HASH_ENTER, - NULL); - memcpy(node->data, buffer, BLCKSZ); + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); } /* @@ -156,9 +137,6 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) /* * inmem_writeback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. */ void inmem_writeback(SMgrRelation reln, ForkNumber forknum, @@ -173,20 +151,13 @@ void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer) { - WrNodeKey key; - WrNode *node; + int pg; - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blkno; - node = hash_search(inmem_files, - &key, - HASH_FIND, - NULL); - if (node != NULL) - memcpy(buffer, node->data, BLCKSZ); - else + pg = locate_page(reln, forknum, blkno); + if (pg < 0) memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); } /* @@ -200,17 +171,19 @@ void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - WrNodeKey key; - WrNode *node; + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blocknum; - node = hash_search(inmem_files, - &key, - HASH_ENTER, - NULL); - memcpy(node->data, buffer, BLCKSZ); + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } + memcpy(page_body[pg], buffer, BLCKSZ); } /* @@ -219,23 +192,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { - WrNodeKey key; - WrNode *node; - - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = 0; + int nblocks = 0; - while (true) + for (int i = 0; i < used_pages; i++) { - node = hash_search(inmem_files, - &key, - HASH_FIND, - NULL); - if (node == NULL) - return key.blkno; - key.blkno += 1; + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + if (page_tag[i].blockNum >= nblocks) + nblocks = page_tag[i].blockNum + 1; + } } + return nblocks; } /* @@ -248,19 +216,12 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) /* * inmem_immedsync() -- Immediately sync a relation to stable storage. - * - * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. We - * sync active and inactive segments; smgrDoPendingSyncs() relies on this. - * Consider a relation skipping WAL. 
Suppose a checkpoint syncs blocks of - * some segment, then mdtruncate() renders that segment inactive. If we - * crash before the next checkpoint syncs the newly-inactive segment, that - * segment may survive recovery, reintroducing unwanted data into the table. */ void inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -283,12 +244,11 @@ static const struct f_smgr inmem_smgr = const f_smgr * smgr_inmem(BackendId backend, RelFileNode rnode) { - if (backend != InvalidBackendId && !InRecovery) + Assert(InRecovery); + if (backend != InvalidBackendId) return smgr_standard(backend, rnode); else - { return &inmem_smgr; - } } void diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index b96e033e53c..27eb4f28ca5 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -804,7 +804,6 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } - /* * ReadBuffer_common -- common logic for all ReadBuffer variants * @@ -819,7 +818,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, + * so let's use local buffers instead + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 0ddd2ddec24..16298ea7f4f 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -99,6 +99,10 @@ static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; +bool am_wal_redo_postgres; + +static XLogReaderState *reader_state; + #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP @@ -166,12 +170,20 @@ WalRedoMain(int argc, char *argv[], InitStandaloneProcess(argv[0]); SetProcessingMode(InitProcessing); + am_wal_redo_postgres = true; /* * Set default values for command-line options. */ InitializeGUCOptions(); + /* + * WAL redo does not need a large number of buffers. And speed of + * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * buffers. So let's keep it small (default value is 1024) + */ + num_temp_buffers = 4; + /* * Parse command-line options. * TODO @@ -293,6 +305,7 @@ WalRedoMain(int argc, char *argv[], if (RmgrTable[rmid].rm_startup != NULL) RmgrTable[rmid].rm_startup(); } + reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); #ifdef HAVE_LIBSECCOMP /* We prefer opt-out to opt-in for greater security */ @@ -313,16 +326,13 @@ WalRedoMain(int argc, char *argv[], /* * Main processing loop */ + MemoryContextSwitchTo(MessageContext); + initStringInfo(&input_message); + for (;;) { - /* - * Release storage left over from prior query cycle, and create a new - * query input buffer in the cleared MessageContext. - */ - MemoryContextSwitchTo(MessageContext); - MemoryContextResetAndDeleteChildren(MessageContext); - - initStringInfo(&input_message); + /* Release memory left over from prior query cycle. 
*/ + resetStringInfo(&input_message); set_ps_display("idle"); @@ -330,7 +340,6 @@ WalRedoMain(int argc, char *argv[], * (3) read a command (loop blocks here) */ firstchar = ReadRedoCommand(&input_message); - switch (firstchar) { case 'B': /* BeginRedoForBlock */ @@ -406,23 +415,6 @@ pprint_buffer(char *data, int len) return s.data; } -static char * -pprint_tag(BufferTag *tag) -{ - StringInfoData s; - - initStringInfo(&s); - - appendStringInfo(&s, "%u/%u/%u.%d blk %u", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum - ); - - return s.data; -} /* ---------------------------------------------------------------- * routines to obtain user input * ---------------------------------------------------------------- @@ -492,7 +484,6 @@ ReadRedoCommand(StringInfo inBuf) return qtype; } - /* * Prepare for WAL replay on given block */ @@ -502,7 +493,6 @@ BeginRedoForBlock(StringInfo input_message) RelFileNode rnode; ForkNumber forknum; BlockNumber blknum; - MemoryContext oldcxt; SMgrRelation reln; /* @@ -520,16 +510,14 @@ BeginRedoForBlock(StringInfo input_message) rnode.relNode = pq_getmsgint(input_message, 4); blknum = pq_getmsgint(input_message, 4); - oldcxt = MemoryContextSwitchTo(TopMemoryContext); INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); - { - char* buf = pprint_tag(&target_redo_tag); - elog(TRACE, "BeginRedoForBlock %s", buf); - pfree(buf); - } - - MemoryContextSwitchTo(oldcxt); + elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", + target_redo_tag.rnode.spcNode, + target_redo_tag.rnode.dbNode, + target_redo_tag.rnode.relNode, + target_redo_tag.forkNum, + target_redo_tag.blockNum); reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || @@ -589,7 +577,6 @@ ApplyRecord(StringInfo input_message) XLogRecPtr lsn; XLogRecord *record; int nleft; - XLogReaderState reader_state; /* * message format: @@ -607,20 +594,15 @@ ApplyRecord(StringInfo input_message) elog(ERROR, "mismatch between record (%d) and message size (%d)", record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); - /* FIXME: use XLogReaderAllocate() */ - memset(&reader_state, 0, sizeof(XLogReaderState)); - reader_state.ReadRecPtr = 0; /* no 'prev' record */ - reader_state.EndRecPtr = lsn; /* this record */ - reader_state.decoded_record = record; - reader_state.errormsg_buf = palloc(1000 + 1); /* MAX_ERRORMSG_LEN */ - - if (!DecodeXLogRecord(&reader_state, record, &errormsg)) + XLogBeginRead(reader_state, lsn); + reader_state->decoded_record = record; + if (!DecodeXLogRecord(reader_state, record, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); /* Ignore any other blocks than the ones the caller is interested in */ redo_read_buffer_filter = redo_block_filter; - RmgrTable[record->xl_rmid].rm_redo(&reader_state); + RmgrTable[record->xl_rmid].rm_redo(reader_state); redo_read_buffer_filter = NULL; @@ -701,8 +683,8 @@ GetPage(StringInfo input_message) } while (tot_written < BLCKSZ); ReleaseBuffer(buf); - DropDatabaseBuffers(rnode.dbNode); - smgrinit(); //reset inmem smgr state + DropRelFileNodeAllLocalBuffers(rnode); + smgrinit(); /* reset inmem smgr state */ elog(TRACE, "Page sent back for block %u", blknum); } diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 3f155ce4f84..72bd0a7ebd4 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -489,4 +489,7 @@ extern void CancelBackup(void); extern size_t get_hash_memory_limit(void); extern int 
get_hash_mem(void); +/* in src/backend/tcop/zenith_wal_redo.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ From 2117482b920933c27e61c3e15bc4ac4cb6cb84cc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 25 Apr 2022 19:54:53 +0300 Subject: [PATCH 123/214] Fix missed include for InRecovery (#149) * Fix missed include for InRecovery * Fix missed include for InRecovery (used only in debug version with --enable--cassert) --- contrib/zenith/inmem_smgr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index bdd58731f3c..95e7d062f61 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -21,6 +21,7 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "pagestore_client.h" #include "storage/block.h" #include "storage/buf_internals.h" From 5fc7d18370650955386be812f90e6e8f2199f8ea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 09:44:00 +0300 Subject: [PATCH 124/214] Avoid "bad syscall 39" on assertion failure in WAL redo process. ExceptionalCondition calls getpid(), which is currently forbidden by seccomp. You only get there if something else went wrong, but the "bad syscall" error hides the underlying cause of the error, which makes debugging hard. --- src/backend/tcop/zenith_wal_redo.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 16298ea7f4f..f09ae5273b2 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -125,6 +125,13 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(mmap), PG_SCMP_ALLOW(munmap), #endif + /* + * getpid() is called on assertion failure, in ExceptionalCondition. + * It's not really needed, but seems pointless to hide it either. The + * system call unlikely to expose a kernel vulnerability, and the PID + * is stored in MyProcPid anyway. + */ + PG_SCMP_ALLOW(getpid), /* Enable those for a proper shutdown. PG_SCMP_ALLOW(munmap), From e3d732f0ac27075b62d2faa46515d3f3b9691f8a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 12:17:45 +0300 Subject: [PATCH 125/214] Add error context, if replaying a WAL record fails in WAL redo process. --- src/backend/access/transam/xlog.c | 3 +-- src/backend/tcop/zenith_wal_redo.c | 37 +++++++++++++++++++++++++++++- src/include/access/xlog.h | 1 + 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 73dd459a318..d16ffca466c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -977,7 +977,6 @@ static bool CheckForStandbyTrigger(void); static void xlog_outrec(StringInfo buf, XLogReaderState *record); #endif static void xlog_block_info(StringInfo buf, XLogReaderState *record); -static void xlog_outdesc(StringInfo buf, XLogReaderState *record); static void pg_start_backup_callback(int code, Datum arg); static void pg_stop_backup_callback(int code, Datum arg); static bool read_backup_label(XLogRecPtr *checkPointLoc, @@ -10989,7 +10988,7 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) * Returns a string describing an XLogRecord, consisting of its identity * optionally followed by a colon, a space, and a further description. 
*/ -static void +void xlog_outdesc(StringInfo buf, XLogReaderState *record) { RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index f09ae5273b2..ac55c7cefc6 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -93,6 +93,7 @@ static int ReadRedoCommand(StringInfo inBuf); static void BeginRedoForBlock(StringInfo input_message); static void PushPage(StringInfo input_message); static void ApplyRecord(StringInfo input_message); +static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); @@ -579,11 +580,11 @@ PushPage(StringInfo input_message) static void ApplyRecord(StringInfo input_message) { - /* recovery here */ char *errormsg; XLogRecPtr lsn; XLogRecord *record; int nleft; + ErrorContextCallback errcallback; /* * message format: @@ -601,7 +602,18 @@ ApplyRecord(StringInfo input_message) elog(ERROR, "mismatch between record (%d) and message size (%d)", record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + /* Setup error traceback support for ereport() */ + errcallback.callback = apply_error_callback; + errcallback.arg = (void *) reader_state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + XLogBeginRead(reader_state, lsn); + /* + * In lieu of calling XLogReadRecord, store the record 'decoded_record' + * buffer directly. + */ + reader_state->ReadRecPtr = lsn; reader_state->decoded_record = record; if (!DecodeXLogRecord(reader_state, record, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -613,10 +625,33 @@ ApplyRecord(StringInfo input_message) redo_read_buffer_filter = NULL; + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); } +/* + * Error context callback for errors occurring during ApplyRecord + */ +static void +apply_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + static bool redo_block_filter(XLogReaderState *record, uint8 block_id) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index e34f1deaf6e..f35e3686cf8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -307,6 +307,7 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); extern void issue_xlog_fsync(int fd, XLogSegNo segno); From 6a546dbc3acaa25d8711131a555ae35db49519ad Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 13:19:01 +0300 Subject: [PATCH 126/214] Add WARNING for debugging purposes. 
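The warning lands in redo_block_filter(): it flags a redo routine touching a relation other than the one currently being reconstructed. Roughly, with the guard in place (reconstructed from the hunk below plus the existing filter logic, so details of the surrounding function may differ):

    static bool
    redo_block_filter(XLogReaderState *record, uint8 block_id)
    {
        BufferTag   target_tag;

        if (!XLogRecGetBlockTag(record, block_id,
                                &target_tag.rnode, &target_tag.forkNum,
                                &target_tag.blockNum))
            elog(PANIC, "failed to locate backup block with ID %d", block_id);

        /* A WAL redo function should only ever touch the relation it modifies. */
        if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
            elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
                 target_tag.rnode.spcNode, target_tag.rnode.dbNode,
                 target_tag.rnode.relNode, target_tag.forkNum,
                 target_tag.blockNum);

        /* Returning true makes the caller skip blocks other than the target. */
        return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
    }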
--- contrib/zenith/inmem_smgr.c | 4 ++++ src/backend/tcop/zenith_wal_redo.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index 95e7d062f61..dbc780624a8 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -195,6 +195,10 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { int nblocks = 0; + /* + * Find the hightest-numbered page, and report that as the relation size. + * XXX: Why does this get called during WAL replay at all? + */ for (int i = 0; i < used_pages; i++) { if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index ac55c7cefc6..aa423a0e2dc 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -664,6 +664,14 @@ redo_block_filter(XLogReaderState *record, uint8 block_id) elog(PANIC, "failed to locate backup block with ID %d", block_id); } + /* + * Can a WAL redo function ever access a relation other than the one that + * it modifies? I don't see why it would. + */ + if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", + target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + /* * If this block isn't one we are currently restoring, then return 'true' * so that this gets ignored From 3170b4113f656caec65f07749e62be44901c765e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 14:20:49 +0300 Subject: [PATCH 127/214] Turn Assertion into elog(ERROR), to help with debugging. This error is happening in the 'pg_regress' test in the CI, but not on my laptop. Turn it into an ERROR, so that we get the error context and backtrace of it. --- src/backend/storage/smgr/smgr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 10a6f65c118..f1e676bcc3e 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -194,7 +194,11 @@ smgropen(RelFileNode rnode, BackendId backend, char relpersistence) if (reln->smgr_relpersistence == 0) reln->smgr_relpersistence = relpersistence; else - Assert(relpersistence == 0 || reln->smgr_relpersistence == relpersistence); + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } } return reln; From 2e719a3ffbf1e6a8855d5fb7dacfbfb6c9d75fe2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 15:05:50 +0300 Subject: [PATCH 128/214] Fix errors in WAL redo about relpersistence mismatch. In the WAL redo process, even "permanent" buffers are stored in the local buffer cache. Need to pass RELPERSISTENCE_PERMANENT to smgropen() in that case. 
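To see why this was failing, recall that the previous patch made smgropen() remember the relpersistence it was first called with and raise an ERROR on a later mismatch. A hypothetical illustration of the sequence the WAL redo process used to hit (rnode standing for the relfilenode of a dirty local buffer):

    SMgrRelation reln;

    /* First open during redo caches 'p' for this relation ... */
    reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);

    /*
     * ... so flushing the same buffer through LocalBufferAlloc(), which used
     * to hard-code RELPERSISTENCE_TEMP, now fails with
     * ERROR:  relpersistence mismatch: smgropen t vs SmgrRelation p
     */
    reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_TEMP);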
--- src/backend/storage/buffer/localbuf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index b9811cc7327..3184b1e5686 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -18,6 +18,7 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/guc.h" @@ -215,7 +216,10 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); From 7623ad94f254b121118ee978ba49f100cf58bae2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 26 Apr 2022 18:51:16 +0400 Subject: [PATCH 129/214] Don't log 'last written LSN ahead of flushed'. That's a valid case, as edited comment says. https://github.com/neondatabase/neon/issues/1303 --- contrib/zenith/pagestore_smgr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 18c55fa5cdc..caa77a59091 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -592,14 +592,16 @@ zenith_get_request_lsn(bool *latest) /* * Is it possible that the last-written LSN is ahead of last flush - * LSN? Probably not, we shouldn't evict a page from the buffer cache + * LSN? Generally not, we shouldn't evict a page from the buffer cache * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. But better safe than sorry. + * "WAL before data" rule. However, such case does exist at index building, + * _bt_blwritepage logs the full page without flushing WAL before + * smgrextend (files are fsynced before build ends). 
*/ flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) { - elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn, (uint32) (flushlsn >> 32), (uint32) flushlsn); XLogFlush(lsn); From 026428ca2c0af113db9d5b1f87a1aaa664193a6a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 27 Apr 2022 17:36:20 +0300 Subject: [PATCH 130/214] Perform inmem_smgr cleaup after processing each record (#154) * Perform inmem_smgr cleaup after processing each record * Prevent eviction of wal redo target page * Prevent eviction of wal redo target page frmo temp buffers --- contrib/zenith/inmem_smgr.c | 6 +----- src/backend/storage/buffer/localbuf.c | 8 ++++++++ src/backend/tcop/zenith_wal_redo.c | 5 ++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index dbc780624a8..abc600f0b4a 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -28,7 +28,7 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" -#define MAX_PAGES 128 +#define MAX_PAGES 32 static BufferTag page_tag[MAX_PAGES]; static char page_body[MAX_PAGES][BLCKSZ]; @@ -195,10 +195,6 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { int nblocks = 0; - /* - * Find the hightest-numbered page, and report that as the relation size. - * XXX: Why does this get called during WAL replay at all? - */ for (int i = 0; i < used_pages; i++) { if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3184b1e5686..f22ec0d82df 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -25,6 +25,8 @@ #include "utils/memutils.h" #include "utils/resowner_private.h" +/* ZENITH: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -183,6 +185,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, if (LocalRefCount[b] == 0) { + if (-b - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index aa423a0e2dc..68f29564328 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -100,6 +100,7 @@ static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; +Buffer wal_redo_buffer; bool am_wal_redo_postgres; static XLogReaderState *reader_state; @@ -566,6 +567,7 @@ PushPage(StringInfo input_message) content = pq_getmsgbytes(input_message, BLCKSZ); buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL); + wal_redo_buffer = buf; page = BufferGetPage(buf); memcpy(page, content, BLCKSZ); MarkBufferDirty(buf); /* pro forma */ @@ -594,6 +596,8 @@ ApplyRecord(StringInfo input_message) */ lsn = pq_getmsgint64(input_message); + smgrinit(); /* reset inmem smgr state */ + /* note: the input must be aligned here */ record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); @@ -734,7 +738,6 @@ GetPage(StringInfo input_message) ReleaseBuffer(buf); DropRelFileNodeAllLocalBuffers(rnode); - smgrinit(); /* reset inmem smgr state */ elog(TRACE, "Page sent back for block %u", blknum); } From 
984248de72ece1f2d3fbc4140b1d45ed045db605 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 May 2022 13:03:19 +0300 Subject: [PATCH 131/214] Avoid extending relation in the WAL redo process. It's a waste of time, and otherwise you can run into the MAX_PAGES limit. Fixes https://github.com/neondatabase/neon/issues/1615 --- contrib/zenith/inmem_smgr.c | 38 +++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index abc600f0b4a..1d8aa9ac2ee 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -177,12 +177,27 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = locate_page(reln, forknum, blocknum); if (pg < 0) { + elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); pg = used_pages; used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); } memcpy(page_body[pg], buffer, BLCKSZ); } @@ -193,18 +208,17 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { - int nblocks = 0; - - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum) - { - if (page_tag[i].blockNum >= nblocks) - nblocks = page_tag[i].blockNum + 1; - } - } - return nblocks; + /* + * It's not clear why a WAL redo function would call smgrnblocks(). + * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; } /* From cd5729ddff3ad8559aaa43aea9f8d28773a975a3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 1 May 2022 17:02:23 +0400 Subject: [PATCH 132/214] Send timeline_start_lsn in Elected and receive it in VoteResponse messages. To support remembering it on safekeeper. Currently compute doesn't know initial LSN on non-first boot (though it could get it from pageserver in theory), so we rely on safekeepers to fetch it back. While changing the protocol, also add node_id to AcceptorProposerGreeting. 
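The proposer-side handling of the new field is easiest to read in isolation: every voter may report the timeline start LSN (or InvalidXLogRecPtr if it does not know it), and the proposer adopts any known value while warning about disagreement. A condensed sketch of that reconciliation (the real loop below additionally checks that the safekeeper actually gave its vote):

    timelineStartLsn = InvalidXLogRecPtr;
    for (int i = 0; i < n_safekeepers; i++)
    {
        XLogRecPtr  reported = safekeeper[i].voteResponse.timelineStartLsn;

        if (reported == InvalidXLogRecPtr)
            continue;           /* this voter does not know it */
        if (timelineStartLsn != InvalidXLogRecPtr && timelineStartLsn != reported)
            elog(WARNING, "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
                 LSN_FORMAT_ARGS(timelineStartLsn),
                 LSN_FORMAT_ARGS(reported));
        timelineStartLsn = reported;
    }

When no voter knows it yet (bootstrap), the value falls back to the same start position used for propEpochStartLsn.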
--- src/backend/replication/walproposer.c | 75 ++++++++++++++++++--------- src/include/replication/walproposer.h | 9 +++- 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d9d44201242..21a538fd603 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -99,6 +99,7 @@ static TermHistory propTermHistory; /* term history of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ +static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; @@ -767,7 +768,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ if (!AsyncFlush(sk)) return; - + /* flush is done, event set and state will be updated later */ StartStreaming(sk); break; @@ -977,7 +978,7 @@ RecvAcceptorGreeting(Safekeeper *sk) } else if (sk->greetResponse.term > propTerm) { - /* Another compute with higher term is running. */ + /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", sk->host, sk->port, sk->greetResponse.term, propTerm); @@ -1037,10 +1038,11 @@ RecvVoteResponse(Safekeeper *sk) return; elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only @@ -1081,7 +1083,7 @@ RecvVoteResponse(Safekeeper *sk) /* * Called once a majority of acceptors have voted for us and current proposer * has been elected. - * + * * Sends ProposerElected message to all acceptors in SS_IDLE state and starts * replication from walsender. */ @@ -1118,7 +1120,7 @@ HandleElectedProposer(void) SendProposerElected(&safekeeper[i]); } - /* + /* * The proposer has been elected, and there will be no quorum waiting * after this point. There will be no safekeeper with state SS_IDLE * also, because that state is used only for quorum waiting. 
@@ -1173,6 +1175,7 @@ DetermineEpochStartLsn(void) propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { @@ -1187,6 +1190,20 @@ DetermineEpochStartLsn(void) donor = i; } truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } } } @@ -1194,12 +1211,16 @@ DetermineEpochStartLsn(void) * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing * was committed yet. To keep the idea of always starting streaming since * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. + * position of the slot. TODO: take it from .signal file. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = MyReplicationSlot->data.restart_lsn; + } ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } @@ -1332,7 +1353,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec * safekeeper is synced, being important for sync-safekeepers) * 2) Communicating starting streaming point -- safekeeper must truncate its WAL * beyond it -- and history of term switching. - * + * * Sets sk->startStreamingAt. */ static void @@ -1343,7 +1364,7 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; - /* + /* * Determine start LSN by comparing safekeeper's log term switch history and * proposer's, searching for the divergence point. * @@ -1352,7 +1373,7 @@ SendProposerElected(Safekeeper *sk) * wrote some WAL on single sk and died; we stream since the beginning then. */ th = &sk->voteResponse.termHistory; - /* + /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. */ @@ -1382,11 +1403,11 @@ SendProposerElected(Safekeeper *sk) * that all safekeepers reported that they have persisted WAL up * to the truncateLsn before, but now current safekeeper tells * otherwise. - * + * * Also we have a special condition here, which is empty safekeeper * with no history. In combination with a gap, that can happen when * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with + * which is triggered manually for now, and should be treated with * care. */ @@ -1429,12 +1450,13 @@ SendProposerElected(Safekeeper *sk) msg.term = propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; lastCommonTerm = i >= 0 ? 
propTermHistory.entries[i].term : 0; elog(LOG, - "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); - + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); pq_sendint64_le(&sk->outbuf, msg.term); @@ -1445,6 +1467,7 @@ SendProposerElected(Safekeeper *sk) pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1475,7 +1498,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - /* + /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. */ @@ -1546,7 +1569,7 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. - * + * * LSN comparison checks if we have pending unsent messages. This check isn't * necessary now, because we always send append messages immediately after * arrival. But it's good to have it here in case we change this behavior @@ -1561,9 +1584,9 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* * Send WAL messages starting from sk->streamingAt until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent + * Even if no unsent WAL is available, at least one empty message will be sent * as a heartbeat, if socket is ready. - * + * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. */ @@ -1579,7 +1602,7 @@ SendAppendRequests(Safekeeper *sk) if (sk->flushWrite) { if (!AsyncFlush(sk)) - /* + /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. */ @@ -1631,7 +1654,7 @@ SendAppendRequests(Safekeeper *sk) sk->outbuf.len += req->endLsn - req->beginLsn; writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); - + /* Mark current message as sent, whatever the result is */ sk->streamingAt = endLsn; @@ -1669,7 +1692,7 @@ SendAppendRequests(Safekeeper *sk) * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. - * + * * NB: This function can call SendMessageToNode and produce new messages. */ static bool @@ -1988,7 +2011,7 @@ HandleSafekeeperResponse(void) /* * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the + * flushed to all safekeepers. We must always start streaming from the * beginning of the record, which simplifies decoding on the far end. * * Advanced truncateLsn should be not further than nearest commitLsn. 
@@ -2051,7 +2074,7 @@ HandleSafekeeperResponse(void) } } -/* +/* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. */ @@ -2082,7 +2105,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read * was successful. - * + * * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. @@ -2118,6 +2141,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) { AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); pq_getmsgend(&s); return true; } @@ -2137,6 +2161,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); } + msg->timelineStartLsn = pq_getmsgint64_le(&s); pq_getmsgend(&s); return true; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 19361eeaffc..1fcaaa3fc11 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -11,7 +11,7 @@ #include "replication/walreceiver.h" #define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 1 +#define SK_PROTOCOL_VERSION 2 #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ @@ -147,6 +147,9 @@ typedef enum /* Consensus logical timestamp. */ typedef uint64 term_t; +/* neon storage node id */ +typedef uint64 NNodeId; + /* * Proposer <-> Acceptor messaging. */ @@ -177,6 +180,7 @@ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; term_t term; + NNodeId nodeId; } AcceptorGreeting; /* @@ -214,6 +218,7 @@ typedef struct VoteResponse { XLogRecPtr flushLsn; XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -228,6 +233,8 @@ typedef struct ProposerElected XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; } ProposerElected; /* From db153c9747374ae1086781fc1cc80bae7291329d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 2 May 2022 19:32:27 +0400 Subject: [PATCH 133/214] Verify basebackup LSN against consensus LSN in walproposer. If not, such basebackup (clog etc) is inconsistent and must be retaken. Basebackup LSN is taken by exposing xlog.c RedoStartLSN in shmem. 
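Condensed, the new check reads as follows (dth stands for the donor's term history used earlier in DetermineEpochStartLsn(), and mineLastElectedTerm is the field this patch adds to walproposer shared memory):

    if (!syncSafekeepers && propEpochStartLsn != GetRedoStartLsn())
    {
        /* A plain restart of the previously elected leader is fine ... */
        bool    previously_me =
            dth->n_entries >= 1 &&
            dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm;

        /* ... anything else means clog etc. in the basebackup is stale. */
        if (!previously_me)
            elog(FATAL, "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
                 LSN_FORMAT_ARGS(propEpochStartLsn),
                 LSN_FORMAT_ARGS(GetRedoStartLsn()));
    }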
ref https://github.com/neondatabase/neon/issues/594 --- src/backend/access/transam/xlog.c | 16 ++++++ src/backend/replication/walproposer.c | 77 ++++++++++++++++++--------- src/backend/storage/ipc/ipci.c | 4 +- src/include/access/xlog.h | 2 + src/include/replication/walproposer.h | 10 ++-- 5 files changed, 76 insertions(+), 33 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d16ffca466c..4cf4e43b75b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -751,6 +751,9 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + /* * size of a timeline in zenith pageserver. * used to enforce timeline size limit. @@ -6913,6 +6916,8 @@ StartupXLOG(void) checkPointLoc = zenithLastRec; RedoStartLSN = ControlFile->checkPointCopy.redo; + /* make basebackup LSN available for walproposer */ + XLogCtl->RedoStartLSN = RedoStartLSN; EndRecPtr = ControlFile->checkPointCopy.redo; memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); @@ -6983,6 +6988,7 @@ StartupXLOG(void) /* Get the last valid checkpoint record. */ checkPointLoc = ControlFile->checkPoint; RedoStartLSN = ControlFile->checkPointCopy.redo; + XLogCtl->RedoStartLSN = RedoStartLSN; record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true); if (record != NULL) { @@ -8908,6 +8914,16 @@ SetLastWrittenPageLSN(XLogRecPtr lsn) SpinLockRelease(&XLogCtl->info_lck); } +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + uint64 GetZenithCurrentClusterSize(void) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 21a538fd603..a1b179d4be3 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -29,8 +29,6 @@ * safekeepers, learn start LSN of future epoch and run basebackup' * won't work. * - * TODO: check that LSN on safekeepers after start is the same as it was - * after `postgres --sync-safekeepers`. *------------------------------------------------------------------------- */ #include "postgres.h" @@ -107,6 +105,8 @@ static TimestampTz last_reconnect_attempt; /* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ static bool syncSafekeepers; +static WalproposerShmemState *walprop_shared; + /* Prototypes for private functions */ static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); static void WalProposerStart(void); @@ -1208,20 +1208,16 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing - * was committed yet. To keep the idea of always starting streaming since - * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. TODO: take it from .signal file. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. Start streaming then from the basebackup LSN. 
*/ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); - propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); if (timelineStartLsn == InvalidXLogRecPtr) { - timelineStartLsn = MyReplicationSlot->data.restart_lsn; + timelineStartLsn = GetRedoStartLsn(); } - ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } @@ -1256,6 +1252,32 @@ DetermineEpochStartLsn(void) safekeeper[donor].host, safekeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) ); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since + * which we are going to write according to the consensus. If not, we must + * bail out, as clog and other non rel data is inconsistent. + */ + if (!syncSafekeepers) + { + if (propEpochStartLsn != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; plain + * restart of walproposer not intervened by concurrent compute (who could + * generate WAL) is ok. + */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(FATAL, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } } /* @@ -1880,27 +1902,30 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } - -static ZenithFeedbackState *zf_state; - /* * ZenithFeedbackShmemSize --- report amount of shared memory space needed */ Size -ZenithFeedbackShmemSize(void) +WalproposerShmemSize(void) { - return sizeof(ZenithFeedbackState); + return sizeof(WalproposerShmemState); } bool -ZenithFeedbackShmemInit(void) +WalproposerShmemInit(void) { bool found; LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - zf_state = ShmemInitStruct("Zenith Feedback", - sizeof(ZenithFeedbackState), + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } LWLockRelease(AddinShmemInitLock); return found; @@ -1909,20 +1934,20 @@ ZenithFeedbackShmemInit(void) void zenith_feedback_set(ZenithFeedback *zf) { - SpinLockAcquire(&zf_state->mutex); - memcpy(&zf_state->feedback, zf, sizeof(ZenithFeedback)); - SpinLockRelease(&zf_state->mutex); + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, zf, sizeof(ZenithFeedback)); + SpinLockRelease(&walprop_shared->mutex); } void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { - SpinLockAcquire(&zf_state->mutex); - *writeLsn = zf_state->feedback.ps_writelsn; - *flushLsn = zf_state->feedback.ps_flushlsn; - *applyLsn = zf_state->feedback.ps_applylsn; - SpinLockRelease(&zf_state->mutex); + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 5fb07a87eb8..233bd081f82 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -152,7 +152,7 @@ CreateSharedMemoryAndSemaphores(void) size = 
add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); - size = add_size(size, ZenithFeedbackShmemSize()); + size = add_size(size, WalproposerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); @@ -274,7 +274,7 @@ CreateSharedMemoryAndSemaphores(void) SyncScanShmemInit(); AsyncShmemInit(); - ZenithFeedbackShmemInit(); + WalproposerShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f35e3686cf8..66fe9dfcd9e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -354,6 +354,8 @@ extern void RemovePromoteSignalFiles(void); extern void SetLastWrittenPageLSN(XLogRecPtr lsn); extern XLogRecPtr GetLastWrittenPageLSN(void); +extern XLogRecPtr GetRedoStartLsn(void); + extern void SetZenithCurrentClusterSize(uint64 size); extern uint64 GetZenithCurrentClusterSize(void); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 1fcaaa3fc11..09743380bc7 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -283,12 +283,12 @@ typedef struct ZenithFeedback } ZenithFeedback; -typedef struct ZenithFeedbackState +typedef struct WalproposerShmemState { slock_t mutex; ZenithFeedback feedback; - -} ZenithFeedbackState; + term_t mineLastElectedTerm; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -393,8 +393,8 @@ void ParseZenithFeedbackMessage(StringInfo reply_message, void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); -Size ZenithFeedbackShmemSize(void); -bool ZenithFeedbackShmemInit(void); +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); void zenith_feedback_set(ZenithFeedback *zf); void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); From be2d0e4a6f5fbd39880c1c3c97b397a3e779547d Mon Sep 17 00:00:00 2001 From: anastasia Date: Fri, 23 Jul 2021 15:56:58 +0300 Subject: [PATCH 134/214] Implement pg_database_size(): - extend zenith pageserver API to handle new request type; - add dbsize_hook to intercept db_dir_size() call. 
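A usage sketch, not part of the patch and only assuming the neon smgr extension is loaded (its _PG_init() installs the hook): the stock size function is then answered by the pageserver instead of scanning base/<dboid> on local disk.

    -- Hypothetical usage: the reported size now comes from the pageserver.
    SELECT pg_database_size(current_database());

Under the hood the hook issues a ZenithDbSizeRequest for the database at the current request LSN and returns the pageserver's answer.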
--- contrib/zenith/libpagestore.c | 1 + contrib/zenith/pagestore_client.h | 19 ++++++ contrib/zenith/pagestore_smgr.c | 100 ++++++++++++++++++++++++++++++ src/backend/utils/adt/dbsize.c | 9 +++ src/include/storage/smgr.h | 3 + 5 files changed, 132 insertions(+) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d8ec3eba81d..600e3d791c6 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -457,5 +457,6 @@ _PG_init(void) zenith_log(PqPageStoreTrace, "set zenith_smgr hook"); smgr_hook = smgr_zenith; smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; } } diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index a5dcd1efc06..051dc6bc9a1 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -31,14 +31,18 @@ typedef enum T_ZenithExistsRequest = 0, T_ZenithNblocksRequest, T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, /* pagestore -> pagestore_client */ T_ZenithExistsResponse = 100, T_ZenithNblocksResponse, T_ZenithGetPageResponse, T_ZenithErrorResponse, + T_ZenithDbSizeResponse, } ZenithMessageTag; + + /* base struct for c-style inheritance */ typedef struct { @@ -75,6 +79,14 @@ typedef struct ForkNumber forknum; } ZenithNblocksRequest; + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + typedef struct { ZenithRequest req; @@ -107,6 +119,12 @@ typedef struct char page[FLEXIBLE_ARRAY_MEMBER]; } ZenithGetPageResponse; +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + typedef struct { ZenithMessageTag tag; @@ -165,6 +183,7 @@ extern void zenith_write(SMgrRelation reln, ForkNumber forknum, extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 zenith_dbsize(Oid dbNode); extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index caa77a59091..544250bb55d 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -143,6 +143,16 @@ zm_pack_request(ZenithRequest *msg) break; } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } case T_ZenithGetPageRequest: { ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; @@ -163,6 +173,7 @@ zm_pack_request(ZenithRequest *msg) case T_ZenithNblocksResponse: case T_ZenithGetPageResponse: case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: default: elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; @@ -216,6 +227,18 @@ zm_unpack_response(StringInfo s) break; } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + case T_ZenithErrorResponse: { ZenithErrorResponse *msg_resp; @@ -242,6 +265,7 @@ zm_unpack_response(StringInfo s) case T_ZenithExistsRequest: case T_ZenithNblocksRequest: case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: default: elog(ERROR, "unexpected zenith message tag 0x%02x", tag); break; @@ 
-309,6 +333,18 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: @@ -356,6 +392,18 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size + ); + appendStringInfoChar(&s, '}'); + + break; + } default: appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); @@ -1286,6 +1334,58 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } +/* + * zenith_db_size() -- Get the size of the database in bytes. + */ +int64 +zenith_dbsize(Oid dbNode) +{ + ZenithResponse *resp; + int64 db_size; + XLogRecPtr request_lsn; + bool latest; + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithDbSizeRequest request = { + .req.tag = T_ZenithDbSizeRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .dbNode = dbNode, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithDbSizeResponse: + db_size = ((ZenithDbSizeResponse *) resp)->db_size; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read db size of db %u from page server at lsn %X/%08X", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + db_size); + + pfree(resp); + return db_size; +} + /* * zenith_truncate() -- Truncate relation to specified number of blocks. 
*/ diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 33474e01941..9f4edbc60d8 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -77,6 +77,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -106,6 +108,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index c08eaed6179..4a0d6b2e09b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -131,6 +131,9 @@ extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; extern void smgr_init_standard(void); extern void smgr_shutdown_standard(void); +// Alternative implementation of calculate_database_size() +typedef const int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); extern PGDLLIMPORT smgr_hook_type smgr_hook; From cfe5f3cffa22d52bf5f00cb00dd650745acb1c1c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 May 2022 12:58:40 +0400 Subject: [PATCH 135/214] Shut down instance on basebackup LSN mismatch. To force making basebackup again. --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a1b179d4be3..5d167ed3f9f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1270,7 +1270,7 @@ DetermineEpochStartLsn(void) if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) { - elog(FATAL, + elog(PANIC, "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn), LSN_FORMAT_ARGS(GetRedoStartLsn())); From 61d19f586c2ae9f89980ddaebf29cb806bf553b7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 9 May 2022 22:08:10 +0300 Subject: [PATCH 136/214] Use compute-tools from the new neondatabase Docker Hub repo --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 496228cabcd..b9dade0d7a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # # Image with pre-built tools # -FROM zenithdb/compute-tools:latest AS compute-deps +FROM neondatabase/compute-tools:latest AS compute-deps # Only to get ready zenith_ctl binary as deppendency # From caef1a45df5f304ebed9e6483a458a26f12274cd Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 10 May 2022 18:50:54 +0300 Subject: [PATCH 137/214] zenith_test_utils extension: add neon_xlogflush() This function is to simplify complex WAL generation in https://github.com/neondatabase/neon/pull/1574 `pg_logical_emit_message` is the easiest way to get a big WAL record, but: * If it's transactional, it gets `COMMIT` record right after * If it's not, WAL is not flushed at all. The function helps here, so we don't rely on the background WAL writer. I suspect the plain `xlogflush()` name may collide in the future, hence the prefix. 
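A usage sketch of the intended pattern (the prefix and payload below are made up, not taken from the tests): emit a large non-transactional message and flush it explicitly, so the test does not have to wait for the background WAL writer.

    -- Hypothetical usage: flush the WAL record that was just emitted.
    SELECT neon_xlogflush(
        pg_logical_emit_message(false, 'neon-test', repeat('x', 100000)));

pg_logical_emit_message() returns the LSN at the end of the record it wrote, so flushing up to that LSN makes the whole record durable.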
--- .../zenith_test_utils/zenith_test_utils--1.0.sql | 5 +++++ contrib/zenith_test_utils/zenithtest.c | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index d595b043abf..adc821bcc13 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -22,3 +22,8 @@ CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index c1e2c1c92f4..d3616d633ed 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -20,6 +20,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/builtins.h" +#include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" #include "zenith/pagestore_client.h" @@ -32,6 +33,7 @@ PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); /* * Linkage to functions in zenith module. @@ -289,3 +291,14 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(raw_page); } } + +/* + * Directly calls XLogFlush(lsn) to flush WAL buffers. + */ +Datum +neon_xlogflush(PG_FUNCTION_ARGS) +{ + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); + PG_RETURN_VOID(); +} From 27cd816110e9a7e877996c9370cf628a6cf9cc9e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 13 May 2022 18:29:32 +0300 Subject: [PATCH 138/214] Reduce noise in the logs from inmem_write() I'm seeing a lot of these warnings from B-tree SPLIT records: WARNING: inmem_write() called for 1663/12990/16397.0 blk 2630: used_pages 0 CONTEXT: WAL redo at 1/235A1B50 for Btree/SPLIT_R: level 0, firstrightoff 368, newitemoff 408, postingoff 0 That seems OK, replaying a split record legitimately accesses many buffers: the left half, the right half, left sibling, right sibling, and child. We could bump up 'temp_buffers' (currently 4), but I didn't do that beceause it's also good to get some test coverage for the inmem_smgr.c. --- contrib/zenith/inmem_smgr.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index 1d8aa9ac2ee..4eff64bf370 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -28,7 +28,11 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" -#define MAX_PAGES 32 +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 static BufferTag page_tag[MAX_PAGES]; static char page_body[MAX_PAGES][BLCKSZ]; @@ -177,7 +181,15 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = locate_page(reln, forknum, blocknum); if (pg < 0) { - elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. 
But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, @@ -191,7 +203,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); } else { - elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, From 62fd2a77a74a4594f678bcbffc87adc8dce48c5a Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 18 May 2022 12:14:28 +0400 Subject: [PATCH 139/214] Use compute_ctl instead of zenith_ctl (#162) --- Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9dade0d7a8..7f4710d3a5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM neondatabase/compute-tools:latest AS compute-deps -# Only to get ready zenith_ctl binary as deppendency +# Only to get ready compute_ctl binary as deppendency # # Image with Postgres build deps @@ -56,11 +56,14 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local # Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl + +# XXX: temporary symlink for compatibility with old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl # Add postgres shared objects to the search path RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig USER postgres -ENTRYPOINT ["/usr/local/bin/zenith_ctl"] +ENTRYPOINT ["/usr/local/bin/compute_ctl"] From f6ab59e4287dfd3cc51fa4517408c3a25c1762f7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 May 2022 19:44:58 +0300 Subject: [PATCH 140/214] Improve error messages on seccomp loading errors. At https://github.com/neondatabase/neon/pull/1783#issuecomment-1136144433, Kirill saw case where the WAL redo process failed to open /dev/null. That's pretty weird, and I have no idea what might be causing it, but with this patch we'll at least get a little more details if it happens again. This will print the OS error (with %m) if it happens, and also distinguishes between the two error cases that previously both emitted the 'failed to open a test file' error. 
--- src/backend/postmaster/seccomp.c | 39 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c index 03971a072cf..3ac21b02983 100644 --- a/src/backend/postmaster/seccomp.c +++ b/src/backend/postmaster/seccomp.c @@ -99,9 +99,6 @@ static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_act void seccomp_load_rules(PgSeccompRule *rules, int count) { -#define raise_error(str) \ - ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: " str))) - struct sigaction action = { .sa_flags = SA_SIGINFO }; PgSeccompRule rule; long fd; @@ -113,37 +110,51 @@ void seccomp_load_rules(PgSeccompRule *rules, int count) */ action.sa_sigaction = seccomp_test_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) - raise_error("failed to install a test SIGSYS handler"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install test SIGSYS handler"))); /* * First, check that open of a well-known file works. * XXX: We use raw syscall() to call the very open(). */ fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (fd < 0 || seccomp_test_sighandler_done) - raise_error("failed to open a test file"); - close((int)fd); + if (seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: signal handler test flag was set unexpectedly"))); + if (fd < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); + close((int) fd); /* Set a trap on open() to test seccomp bpf */ rule = PG_SCMP(open, SCMP_ACT_TRAP); if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) - raise_error("failed to load a test filter"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not load test trap"))); /* Finally, check that open() now raises SIGSYS */ - (void)syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) - raise_error("SIGSYS handler doesn't seem to work"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: SIGSYS handler doesn't seem to work"))); /* Now that everything seems to work, install a proper handler */ action.sa_sigaction = seccomp_deny_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) - raise_error("failed to install a proper SIGSYS handler"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install SIGSYS handler"))); /* If this succeeds, any syscall not in the list will crash the process */ if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) - raise_error("failed to enter seccomp mode"); - -#undef raise_error + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not enter seccomp mode"))); } /* From 289abdb93e056c1d6b01317d6b95c75309c3fa09 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 21:16:21 +0300 Subject: [PATCH 141/214] Rename contrib/zenith to contrib/neon. 
Rename custom GUCs: - zenith.page_server_connstring -> neon.pageserver_connstring - zenith.zenith_tenant -> neon.tenant_id - zenith.zenith_timeline -> neon.timeline_id - zenith.max_cluster_size -> neon.max_cluster_size --- Dockerfile | 2 +- contrib/{zenith => neon}/Makefile | 14 +++++------ contrib/{zenith => neon}/inmem_smgr.c | 3 ++- contrib/{zenith => neon}/libpagestore.c | 18 ++++++------- .../zenith--1.0.sql => neon/neon--1.0.sql} | 2 +- contrib/{zenith/zenith.c => neon/neon.c} | 6 ++--- .../zenith.control => neon/neon.control} | 4 +-- contrib/{zenith => neon}/pagestore_client.h | 2 +- contrib/{zenith => neon}/pagestore_smgr.c | 6 ++--- contrib/{zenith => neon}/relsize_cache.c | 12 ++++----- contrib/neon_test_utils/Makefile | 25 +++++++++++++++++++ .../neon_test_utils--1.0.sql} | 2 +- .../neon_test_utils/neon_test_utils.control | 5 ++++ .../neontest.c} | 10 ++++---- contrib/zenith_test_utils/Makefile | 25 ------------------- .../zenith_test_utils.control | 5 ---- src/backend/replication/walproposer.c | 10 ++++---- src/backend/tcop/zenith_wal_redo.c | 2 -- src/backend/utils/misc/guc.c | 2 +- 19 files changed, 77 insertions(+), 78 deletions(-) rename contrib/{zenith => neon}/Makefile (58%) rename contrib/{zenith => neon}/inmem_smgr.c (99%) rename contrib/{zenith => neon}/libpagestore.c (96%) rename contrib/{zenith/zenith--1.0.sql => neon/neon--1.0.sql} (85%) rename contrib/{zenith/zenith.c => neon/neon.c} (93%) rename contrib/{zenith/zenith.control => neon/neon.control} (54%) rename contrib/{zenith => neon}/pagestore_client.h (99%) rename contrib/{zenith => neon}/pagestore_smgr.c (99%) rename contrib/{zenith => neon}/relsize_cache.c (91%) create mode 100644 contrib/neon_test_utils/Makefile rename contrib/{zenith_test_utils/zenith_test_utils--1.0.sql => neon_test_utils/neon_test_utils--1.0.sql} (92%) create mode 100644 contrib/neon_test_utils/neon_test_utils.control rename contrib/{zenith_test_utils/zenithtest.c => neon_test_utils/neontest.c} (97%) delete mode 100644 contrib/zenith_test_utils/Makefile delete mode 100644 contrib/zenith_test_utils/zenith_test_utils.control diff --git a/Dockerfile b/Dockerfile index 7f4710d3a5b..db472efd5e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,7 @@ RUN mkdir /pg/compute_build && cd /pg/compute_build && \ # Install main binaries and contribs make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/zenith install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/neon install && \ # Install headers make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install diff --git a/contrib/zenith/Makefile b/contrib/neon/Makefile similarity index 58% rename from contrib/zenith/Makefile rename to contrib/neon/Makefile index a4a60d7b88c..b6f3cf400ff 100644 --- a/contrib/zenith/Makefile +++ b/contrib/neon/Makefile @@ -1,17 +1,17 @@ -# contrib/zenith/Makefile +# contrib/neon/Makefile -MODULE_big = zenith +MODULE_big = neon OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o zenith.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o neon.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) -EXTENSION = zenith -DATA = zenith--1.0.sql -PGFILEDESC = "zenith - cloud storage for PostgreSQL" +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" ifdef USE_PGXS PG_CONFIG = pg_config @@ -19,7 
+19,7 @@ PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else SHLIB_PREREQS = submake-libpq -subdir = contrib/zenith +subdir = contrib/neon top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk diff --git a/contrib/zenith/inmem_smgr.c b/contrib/neon/inmem_smgr.c similarity index 99% rename from contrib/zenith/inmem_smgr.c rename to contrib/neon/inmem_smgr.c index 4eff64bf370..7840292b08c 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/neon/inmem_smgr.c @@ -16,7 +16,8 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * contrib/zenith/inmem_smgr.c + * contrib/neon/inmem_smgr.c + * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/contrib/zenith/libpagestore.c b/contrib/neon/libpagestore.c similarity index 96% rename from contrib/zenith/libpagestore.c rename to contrib/neon/libpagestore.c index 600e3d791c6..9d632527f44 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * contrib/zenith/libpqpagestore.c + * contrib/neon/libpqpagestore.c * *------------------------------------------------------------------------- */ @@ -374,7 +374,7 @@ substitute_pageserver_password(const char *page_server_connstring_raw) void _PG_init(void) { - DefineCustomStringVariable("zenith.page_server_connstring", + DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, &page_server_connstring_raw, @@ -383,7 +383,7 @@ _PG_init(void) 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("zenith.callmemaybe_connstring", + DefineCustomStringVariable("neon.callmemaybe_connstring", "Connection string that Page Server or WAL safekeeper should use to connect to us", NULL, &callmemaybe_connstring, @@ -392,7 +392,7 @@ _PG_init(void) 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("zenith.zenith_timeline", + DefineCustomStringVariable("neon.timeline_id", "Zenith timelineid the server is running on", NULL, &zenith_timeline, @@ -401,8 +401,8 @@ _PG_init(void) 0, /* no flags required */ check_zenith_id, NULL, NULL); - DefineCustomStringVariable("zenith.zenith_tenant", - "Zenith tenantid the server is running on", + DefineCustomStringVariable("neon.tenant_id", + "Neon tenantid the server is running on", NULL, &zenith_tenant, "", @@ -410,7 +410,7 @@ _PG_init(void) 0, /* no flags required */ check_zenith_id, NULL, NULL); - DefineCustomBoolVariable("zenith.wal_redo", + DefineCustomBoolVariable("neon.wal_redo", "start in wal-redo mode", NULL, &wal_redo, @@ -419,7 +419,7 @@ _PG_init(void) 0, NULL, NULL, NULL); - DefineCustomIntVariable("zenith.max_cluster_size", + DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, &max_cluster_size, @@ -429,7 +429,7 @@ _PG_init(void) NULL, NULL, NULL); relsize_hash_init(); - EmitWarningsOnPlaceholders("zenith"); + EmitWarningsOnPlaceholders("neon"); if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/neon/neon--1.0.sql similarity index 85% rename from contrib/zenith/zenith--1.0.sql rename to contrib/neon/neon--1.0.sql index e414be8ceea..34f1ba78d4f 100644 --- a/contrib/zenith/zenith--1.0.sql +++ b/contrib/neon/neon--1.0.sql @@ -1,4 +1,4 @@ -\echo Use "CREATE EXTENSION zenith" to load this file. \quit +\echo Use "CREATE EXTENSION neon" to load this file. 
\quit CREATE FUNCTION pg_cluster_size() RETURNS bigint diff --git a/contrib/zenith/zenith.c b/contrib/neon/neon.c similarity index 93% rename from contrib/zenith/zenith.c rename to contrib/neon/neon.c index e88984d918c..f6bf6f40d26 100644 --- a/contrib/zenith/zenith.c +++ b/contrib/neon/neon.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * - * zenith.c - * Utility functions to expose zenith specific information to user + * neon.c + * Utility functions to expose neon specific information to user * * IDENTIFICATION - * contrib/zenith/zenith.c + * contrib/neon/neon.c * *------------------------------------------------------------------------- */ diff --git a/contrib/zenith/zenith.control b/contrib/neon/neon.control similarity index 54% rename from contrib/zenith/zenith.control rename to contrib/neon/neon.control index 9aa5e2f067a..84f79881c1e 100644 --- a/contrib/zenith/zenith.control +++ b/contrib/neon/neon.control @@ -1,4 +1,4 @@ -# zenith extension +# neon extension comment = 'cloud storage for PostgreSQL' default_version = '1.0' -module_pathname = '$libdir/zenith' +module_pathname = '$libdir/neon' diff --git a/contrib/zenith/pagestore_client.h b/contrib/neon/pagestore_client.h similarity index 99% rename from contrib/zenith/pagestore_client.h rename to contrib/neon/pagestore_client.h index 051dc6bc9a1..eedc0864e90 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/neon/pagestore_client.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/zenith/pagestore_client.h + * contrib/neon/pagestore_client.h * *------------------------------------------------------------------------- */ diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c similarity index 99% rename from contrib/zenith/pagestore_smgr.c rename to contrib/neon/pagestore_smgr.c index 544250bb55d..8086143ed46 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -39,7 +39,7 @@ * * * IDENTIFICATION - * contrib/zenith/pagestore_smgr.c + * contrib/neon/pagestore_smgr.c * *------------------------------------------------------------------------- */ @@ -887,7 +887,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_DISK_FULL), errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); + errhint("This limit is defined by neon.max_cluster_size GUC"))); } zenith_wallog_page(reln, forkNum, blkno, buffer); @@ -1005,7 +1005,7 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * While function is defined in the zenith extension it's used within zenith_test_utils directly. + * While function is defined in the zenith extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, diff --git a/contrib/zenith/relsize_cache.c b/contrib/neon/relsize_cache.c similarity index 91% rename from contrib/zenith/relsize_cache.c rename to contrib/neon/relsize_cache.c index 993903b1b18..8dfcffe1d16 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/neon/relsize_cache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * contrib/zenith/relsize_cache.c + * contrib/neon/relsize_cache.c * *------------------------------------------------------------------------- */ @@ -57,10 +57,10 @@ zenith_smgr_shmem_startup(void) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId) GetNamedLWLockTranche("zenith_relsize"); + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); info.keysize = sizeof(RelTag); info.entrysize = sizeof(RelSizeEntry); - relsize_hash = ShmemInitHash("zenith_relsize", + relsize_hash = ShmemInitHash("neon_relsize", relsize_hash_size, relsize_hash_size, &info, HASH_ELEM | HASH_BLOBS); @@ -145,8 +145,8 @@ forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) void relsize_hash_init(void) { - DefineCustomIntVariable("zenith.relsize_hash_size", - "Sets the maximum number of cached relation sizes for zenith", + DefineCustomIntVariable("neon.relsize_hash_size", + "Sets the maximum number of cached relation sizes for neon", NULL, &relsize_hash_size, DEFAULT_RELSIZE_HASH_SIZE, @@ -159,7 +159,7 @@ relsize_hash_init(void) if (relsize_hash_size > 0) { RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("zenith_relsize", 1); + RequestNamedLWLockTranche("neon_relsize", 1); prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; diff --git a/contrib/neon_test_utils/Makefile b/contrib/neon_test_utils/Makefile new file mode 100644 index 00000000000..bd618e6d96e --- /dev/null +++ b/contrib/neon_test_utils/Makefile @@ -0,0 +1,25 @@ +# contrib/neon_test_utils/Makefile + + +MODULE_big = neon_test_utils +OBJS = \ + $(WIN32RES) \ + neontest.o + +EXTENSION = neon_test_utils +DATA = neon_test_utils--1.0.sql +PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" + +EXTRA_INSTALL=contrib/neon + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +PG_CPPFLAGS = -I$(top_srcdir)/contrib +subdir = contrib/neon_test_utils +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/neon_test_utils/neon_test_utils--1.0.sql similarity index 92% rename from contrib/zenith_test_utils/zenith_test_utils--1.0.sql rename to contrib/neon_test_utils/neon_test_utils--1.0.sql index adc821bcc13..402981a9a66 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/neon_test_utils/neon_test_utils--1.0.sql @@ -1,5 +1,5 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION zenith_test_utils" to load this file. \quit +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. 
\quit CREATE FUNCTION test_consume_xids(nxids int) RETURNS VOID diff --git a/contrib/neon_test_utils/neon_test_utils.control b/contrib/neon_test_utils/neon_test_utils.control new file mode 100644 index 00000000000..94e67205039 --- /dev/null +++ b/contrib/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/neon_test_utils/neontest.c similarity index 97% rename from contrib/zenith_test_utils/zenithtest.c rename to contrib/neon_test_utils/neontest.c index d3616d633ed..a3e730efe27 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/neon_test_utils/neontest.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * - * zenithtest.c - * Helpers for zenith testing and debugging + * neontest.c + * Helpers for neon testing and debugging * * IDENTIFICATION - * contrib/zenith_test_utils/zenithtest.c + * contrib/neon_test_utils/neontest.c * *------------------------------------------------------------------------- */ @@ -23,7 +23,7 @@ #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" -#include "zenith/pagestore_client.h" +#include "neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -53,7 +53,7 @@ _PG_init(void) /* Asserts verify that typedefs above match original declarations */ AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/zenith", "zenith_read_at_lsn", + load_external_function("$libdir/neon", "zenith_read_at_lsn", true, NULL); } diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile deleted file mode 100644 index 5b2fcdc18fe..00000000000 --- a/contrib/zenith_test_utils/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# contrib/zenith_test_utils/Makefile - - -MODULE_big = zenith_test_utils -OBJS = \ - $(WIN32RES) \ - zenithtest.o - -EXTENSION = zenith_test_utils -DATA = zenith_test_utils--1.0.sql -PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" - -EXTRA_INSTALL=contrib/zenith - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -PG_CPPFLAGS = -I$(top_srcdir)/contrib -subdir = contrib/zenith_test_utils -top_builddir = ../.. 
-include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/zenith_test_utils/zenith_test_utils.control b/contrib/zenith_test_utils/zenith_test_utils.control deleted file mode 100644 index 9b947b63966..00000000000 --- a/contrib/zenith_test_utils/zenith_test_utils.control +++ /dev/null @@ -1,5 +0,0 @@ -# zenith_test_utils extension -comment = 'helpers for zenith testing and debugging' -default_version = '1.0' -module_pathname = '$libdir/zenith_test_utils' -relocatable = true diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5d167ed3f9f..917ab5d294b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -388,7 +388,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); + load_file("neon", false); for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { @@ -437,15 +437,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; if (!zenith_timeline_walproposer) - elog(FATAL, "zenith.zenith_timeline is not provided"); + elog(FATAL, "neon.timeline_id is not provided"); if (*zenith_timeline_walproposer != '\0' && !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) - elog(FATAL, "zenith.zenith_tenant is not provided"); + elog(FATAL, "neon.tenant_id is not provided"); if (*zenith_tenant_walproposer != '\0' && !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); greetRequest.timeline = ThisTimeLineID; greetRequest.walSegSize = wal_segment_size; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 68f29564328..96e1058c406 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -10,8 +10,6 @@ * processes. Instead, we wait for command from 'stdin', and respond to * 'stdout'. * - * There's a TAP test for this in contrib/zenith_store/t/002_wal_redo_helper.pl - * * The protocol through stdin/stdout is loosely based on the libpq protocol. 
* The process accepts messages through stdin, and each message has the format: * diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7d449ba9fa2..cc45bd3241b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2127,7 +2127,7 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"zenith_test_evict", PGC_POSTMASTER, UNGROUPED, + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, gettext_noop("Evict unpinned pages (for better test coverage)"), }, &zenith_test_evict, From d0298545cb31083cbb53b861a020e04a9266659d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 19 Apr 2022 15:36:25 +0300 Subject: [PATCH 142/214] Rename 'wal_acceptors' GUC to 'safekeepers' --- src/backend/utils/misc/guc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index cc45bd3241b..500aa672611 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4673,8 +4673,8 @@ static struct config_string ConfigureNamesString[] = }, { - {"wal_acceptors", PGC_POSTMASTER, UNGROUPED, - gettext_noop("List of Zenith WAL acceptors (host:port)"), + {"safekeepers", PGC_POSTMASTER, UNGROUPED, + gettext_noop("List of Neon WAL acceptors (host:port)"), NULL, GUC_LIST_INPUT | GUC_LIST_QUOTE }, From a249d3303c7b57c32d1c28d146e8231850183dc1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 31 May 2022 13:34:06 +0400 Subject: [PATCH 143/214] Fix basebackup LSN comparison in walproposer. as basebackup LSN always skips over page header --- src/backend/replication/walproposer.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 917ab5d294b..2b9b034afa2 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1159,6 +1159,21 @@ GetEpoch(Safekeeper *sk) return GetHighestTerm(&sk->voteResponse.termHistory); } +/* If LSN points to the page header, skip it */ +static XLogRecPtr +SkipXLogPageHeader(XLogRecPtr lsn) +{ + if (XLogSegmentOffset(lsn, wal_segment_size) == 0) + { + lsn += SizeOfXLogLongPHD; + } + else if (lsn % XLOG_BLCKSZ == 0) + { + lsn += SizeOfXLogShortPHD; + } + return lsn; +} + /* * Called after majority of acceptors gave votes, it calculates the most * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since @@ -1260,7 +1275,13 @@ DetermineEpochStartLsn(void) */ if (!syncSafekeepers) { - if (propEpochStartLsn != GetRedoStartLsn()) + /* + * Basebackup LSN always points to the beginning of the record (not the + * page), as StartupXLOG most probably wants it this way. Safekeepers + * don't skip header as they need continious stream of data, so + * correct LSN for comparison. + */ + if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) { /* * However, allow to proceed if previously elected leader was me; plain From 678e58d346af525ac91b3f926d98739b7593cef7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 22:22:04 +0300 Subject: [PATCH 144/214] Allow specifiyng the different compute-tools tag (#170) Part of https://github.com/neondatabase/neon/pull/1838 --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index db472efd5e9..a5112e1b4d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,11 @@ +# Allow specifiyng the different compute-tools tag, so we were able to always use +# the locally built image. 
+ARG COMPUTE_TOOLS_TAG=latest + # # Image with pre-built tools # -FROM neondatabase/compute-tools:latest AS compute-deps +FROM neondatabase/compute-tools:$COMPUTE_TOOLS_TAG AS compute-deps # Only to get ready compute_ctl binary as deppendency # From e6747973d12d5cc45fa6583a4ed71c54b68bee49 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 6 Jun 2022 18:41:28 +0300 Subject: [PATCH 145/214] Do not allocate shared memory for wal_redo process (#165) * Do not allocate shared memory for wal_redo process * Add comment --- src/backend/port/sysv_shmem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 35cce89e9c9..b649a9fecdf 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -155,6 +155,12 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) } } #endif + /* + * NEON: do not create shared memory segments for single user wal redo postgres. + * Many spawned instances of wal redo may exhaust kernel.shmmni + */ + if (am_wal_redo_postgres) + return valloc(size); shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); From 2c6f54f88ce5dca9e7e0e4d44aaa01b94fc4747f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 8 Jun 2022 16:24:23 +0300 Subject: [PATCH 146/214] Add check for NULL for malloc in InternalIpcMemoryCreate (#173) * Add check for NULL for malloc in InternalIpcMemoryCreate * apply pgindent --- src/backend/port/sysv_shmem.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index b649a9fecdf..2dc4ec3d26e 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -155,13 +155,22 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) } } #endif + /* - * NEON: do not create shared memory segments for single user wal redo postgres. - * Many spawned instances of wal redo may exhaust kernel.shmmni + * NEON: do not create shared memory segments for single user wal redo + * postgres. 
Many spawned instances of wal redo may exhaust kernel.shmmni */ if (am_wal_redo_postgres) - return valloc(size); + { + void *ptr = malloc(size); + if (ptr == NULL) + { + ereport(FATAL, + (errmsg("could not create shared memory segment with size %zu for WAL redo process", size))); + } + return ptr; + } shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) From 040daf5bd53970254cf8d46c6c0846763c90c8bd Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 11 Jun 2022 00:43:58 +0300 Subject: [PATCH 147/214] Rename ZenithFeedback (#174) --- contrib/neon/neon.c | 2 +- src/backend/replication/walproposer.c | 90 +++++++++++++-------------- src/backend/replication/walsender.c | 30 ++++----- src/include/replication/walproposer.h | 20 +++--- 4 files changed, 71 insertions(+), 71 deletions(-) diff --git a/contrib/neon/neon.c b/contrib/neon/neon.c index f6bf6f40d26..c7c176dba7a 100644 --- a/contrib/neon/neon.c +++ b/contrib/neon/neon.c @@ -49,7 +49,7 @@ backpressure_lsns(PG_FUNCTION_ARGS) bool nulls[3]; TupleDesc tupdesc; - zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); tupdesc = CreateTemplateTupleDesc(3); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 2b9b034afa2..803bd09bf2b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1783,9 +1783,9 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } -/* Parse a ZenithFeedback message, or the ZenithFeedback part of an AppendResponse */ +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ void -ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) { uint8 nkeys; int i; @@ -1800,42 +1800,42 @@ ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseZenithFeedbackMessage: current_timeline_size %lu", - zf->currentClusterSize); + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseZenithFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(zf->ps_writelsn)); + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); } else if (strcmp(key, "ps_flushlsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseZenithFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(zf->ps_flushlsn)); + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); } else if (strcmp(key, "ps_applylsn") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseZenithFeedbackMessage: 
ps_applylsn %X/%X", - LSN_FORMAT_ARGS(zf->ps_applylsn)); + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); } else if (strcmp(key, "ps_replytime") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->ps_replytime = pq_getmsgint64(reply_message); + rf->ps_replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(zf->ps_replytime)); - elog(DEBUG2, "ParseZenithFeedbackMessage: ps_replytime %lu reply_time: %s", - zf->ps_replytime, replyTimeStr); + replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1844,7 +1844,7 @@ ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) { len = pq_getmsgint(reply_message, sizeof(int32)); // read value length // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseZenithFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1924,7 +1924,7 @@ GetAcknowledgedByQuorumWALPosition(void) } /* - * ZenithFeedbackShmemSize --- report amount of shared memory space needed + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed */ Size WalproposerShmemSize(void) @@ -1953,16 +1953,16 @@ WalproposerShmemInit(void) } void -zenith_feedback_set(ZenithFeedback *zf) +replication_feedback_set(ReplicationFeedback *rf) { SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, zf, sizeof(ZenithFeedback)); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); SpinLockRelease(&walprop_shared->mutex); } void -zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); *writeLsn = walprop_shared->feedback.ps_writelsn; @@ -1973,37 +1973,37 @@ zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr /* - * Get ZenithFeedback fields from the most advanced safekeeper + * Get ReplicationFeedback fields from the most advanced safekeeper */ static void -GetLatestZentihFeedback(ZenithFeedback *zf) +GetLatestZentihFeedback(ReplicationFeedback *rf) { int latest_safekeeper = 0; XLogRecPtr ps_writelsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.zf.ps_writelsn > ps_writelsn) + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) { latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.zf.ps_writelsn; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; } } - zf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.zf.currentClusterSize; - zf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_writelsn; - zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; - zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; - zf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.zf.ps_replytime; + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = 
safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - zf->currentClusterSize, - LSN_FORMAT_ARGS(zf->ps_writelsn), - LSN_FORMAT_ARGS(zf->ps_flushlsn), - LSN_FORMAT_ARGS(zf->ps_applylsn), - zf->ps_replytime); + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); - zenith_feedback_set(zf); + replication_feedback_set(rf); } static void @@ -2016,16 +2016,16 @@ HandleSafekeeperResponse(void) minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.zf.ps_flushlsn; + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; if (!syncSafekeepers) { - // Get ZenithFeedback fields from the most advanced safekeeper - GetLatestZentihFeedback(&quorumFeedback.zf); - SetZenithCurrentClusterSize(quorumFeedback.zf.currentClusterSize); + // Get ReplicationFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.zf.ps_flushlsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) { if (minQuorumLsn > quorumFeedback.flushLsn) @@ -2039,7 +2039,7 @@ HandleSafekeeperResponse(void) //flush_lsn - This is what durably stored in WAL service. quorumFeedback.flushLsn, //apply_lsn - This is what processed and durably saved at pageserver. 
- quorumFeedback.zf.ps_flushlsn, + quorumFeedback.rf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2222,7 +2222,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseZenithFeedbackMessage(&s, &msg->zf); + ParseReplicationFeedbackMessage(&s, &msg->rf); pq_getmsgend(&s); return true; } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a3dc11c41bc..a841beebf3f 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -239,7 +239,7 @@ void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); -static void ProcessZenithFeedbackMessage(void); +static void ProcessReplicationFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); @@ -1878,7 +1878,7 @@ ProcessStandbyMessage(void) break; case 'z': - ProcessZenithFeedbackMessage(); + ProcessReplicationFeedbackMessage(); break; default: @@ -1953,25 +1953,25 @@ ProcessStandbyReplyMessage(void) LSN_FORMAT_ARGS(applyPtr)); } -// This message is a zenith extension of postgres replication protocol +// This message is a neon extension of postgres replication protocol static void -ProcessZenithFeedbackMessage(void) +ProcessReplicationFeedbackMessage(void) { - ZenithFeedback zf; + ReplicationFeedback rf; // consume message length pq_getmsgint64(&reply_message); - ParseZenithFeedbackMessage(&reply_message, &zf); + ParseReplicationFeedbackMessage(&reply_message, &rf); - zenith_feedback_set(&zf); + replication_feedback_set(&rf); - SetZenithCurrentClusterSize(zf.currentClusterSize); + SetZenithCurrentClusterSize(rf.currentClusterSize); - ProcessStandbyReply(zf.ps_writelsn, - zf.ps_flushlsn, - zf.ps_applylsn, - zf.ps_replytime, + ProcessStandbyReply(rf.ps_writelsn, + rf.ps_flushlsn, + rf.ps_applylsn, + rf.ps_replytime, false); } @@ -2058,7 +2058,7 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); - /* + /* * walproposer use trunclateLsn instead of flushPtr for confirmed * received location, so we shouldn't update restart_lsn here. 
*/ @@ -3863,10 +3863,10 @@ backpressure_lag(void) XLogRecPtr applyPtr; XLogRecPtr myFlushLsn = GetFlushRecPtr(); - zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) - elog(DEBUG2, "current flushLsn %X/%X ZenithFeedback: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 09743380bc7..c5a5b76268e 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -271,7 +271,7 @@ typedef struct HotStandbyFeedback } HotStandbyFeedback; -typedef struct ZenithFeedback +typedef struct ReplicationFeedback { // current size of the timeline on pageserver uint64 currentClusterSize; @@ -280,13 +280,13 @@ typedef struct ZenithFeedback XLogRecPtr ps_flushlsn; XLogRecPtr ps_applylsn; TimestampTz ps_replytime; -} ZenithFeedback; +} ReplicationFeedback; typedef struct WalproposerShmemState { slock_t mutex; - ZenithFeedback feedback; + ReplicationFeedback feedback; term_t mineLastElectedTerm; } WalproposerShmemState; @@ -310,12 +310,12 @@ typedef struct AppendResponse // Feedback recieved from pageserver includes standby_status_update fields // and custom zenith feedback. // This part of the message is extensible. - ZenithFeedback zf; + ReplicationFeedback rf; } AppendResponse; -// ZenithFeedback is extensible part of the message that is parsed separately +// ReplicationFeedback is extensible part of the message that is parsed separately // Other fields are fixed part -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, zf) +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) /* @@ -388,15 +388,15 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, uint32 feedbackEpoch, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); -void ParseZenithFeedbackMessage(StringInfo reply_message, - ZenithFeedback *zf); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback *rf); void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); Size WalproposerShmemSize(void); bool WalproposerShmemInit(void); -void zenith_feedback_set(ZenithFeedback *zf); -void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); +void replication_feedback_set(ReplicationFeedback *rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ From 43aaebb1da9ad793b28e9c5c366819246ab9fe21 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 11 Jun 2022 00:44:43 +0300 Subject: [PATCH 148/214] Remove callmemaybe bits from compute (#172) --- contrib/neon/libpagestore.c | 28 ---------------------------- contrib/neon/pagestore_client.h | 1 - contrib/neon/pagestore_smgr.c | 1 - 3 files changed, 30 deletions(-) diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c index 9d632527f44..268415d4c61 100644 --- a/contrib/neon/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -71,25 +71,6 @@ zenith_connect() errdetail_internal("%s", msg))); } - /* Ask the Page Server to connect to us, and stream WAL from us. 
*/ - if (callmemaybe_connstring && callmemaybe_connstring[0] - && zenith_tenant - && zenith_timeline) - { - PGresult *res; - - query = psprintf("callmemaybe %s %s %s", zenith_tenant, zenith_timeline, callmemaybe_connstring); - res = PQexec(pageserver_conn, query); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - zenith_log(ERROR, - "[ZENITH_SMGR] callmemaybe command failed"); - } - PQclear(res); - } - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) @@ -383,15 +364,6 @@ _PG_init(void) 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("neon.callmemaybe_connstring", - "Connection string that Page Server or WAL safekeeper should use to connect to us", - NULL, - &callmemaybe_connstring, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); - DefineCustomStringVariable("neon.timeline_id", "Zenith timelineid the server is running on", NULL, diff --git a/contrib/neon/pagestore_client.h b/contrib/neon/pagestore_client.h index eedc0864e90..93ea6771eb9 100644 --- a/contrib/neon/pagestore_client.h +++ b/contrib/neon/pagestore_client.h @@ -147,7 +147,6 @@ typedef struct extern page_server_api *page_server; extern char *page_server_connstring; -extern char *callmemaybe_connstring; extern char *zenith_timeline; extern char *zenith_tenant; extern bool wal_redo; diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 8086143ed46..5fdfea5e487 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -88,7 +88,6 @@ page_server_api *page_server; /* GUCs */ char *page_server_connstring; // with substituted password -char *callmemaybe_connstring; char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; From 1cd83b5b903d4750d803e71d0f7cdc2974fd769a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 Jun 2022 11:00:06 +0300 Subject: [PATCH 149/214] Panic on receiving higher term (#176) --- src/backend/replication/walproposer.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 803bd09bf2b..4b0567be707 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1762,6 +1762,14 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port))); + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + readAnything = true; } From fac89ab34f504df2cdd0a9474706dc17b7e9368e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 16 May 2022 13:08:11 +0300 Subject: [PATCH 150/214] Misc cleanup in libpagestore.c. - Fix typos - Change Zenith -> Neon in the ZENITH_SMGR tag that's printed in error messages that is user-visible, and in various function names and comments that are not user-visible. - pgindent - Remove comment about zm_to_string() leaking memory. It doesn't. 
- Re-word some error messages to match PostgreSQL error message style guide - Cleanup logging style - Don't print JWT token to log --- contrib/neon/libpagestore.c | 110 +++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c index 268415d4c61..2621421532a 100644 --- a/contrib/neon/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * libpqpagestore.c + * libpagestore.c * Handles network communications with the remote pagestore. * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group @@ -32,11 +32,11 @@ PG_MODULE_MAGIC; void _PG_init(void); -#define PqPageStoreTrace DEBUG5 +#define PageStoreTrace DEBUG5 -#define ZENITH_TAG "[ZENITH_SMGR] " -#define zenith_log(tag, fmt, ...) ereport(tag, \ - (errmsg(ZENITH_TAG fmt, ## __VA_ARGS__), \ +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ errhidestmt(true), errhidecontext(true))) bool connected = false; @@ -44,13 +44,13 @@ PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; -static ZenithResponse *zenith_call(ZenithRequest *request); +static ZenithResponse *pageserver_call(ZenithRequest *request); page_server_api api = { - .request = zenith_call + .request = pageserver_call }; static void -zenith_connect() +pageserver_connect() { char *query; int ret; @@ -67,7 +67,7 @@ zenith_connect() pageserver_conn = NULL; ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg("[ZENITH_SMGR] could not establish connection"), + errmsg(NEON_TAG "could not establish connection to pageserver"), errdetail_internal("%s", msg))); } @@ -77,8 +77,7 @@ zenith_connect() { PQfinish(pageserver_conn); pageserver_conn = NULL; - zenith_log(ERROR, - "[ZENITH_SMGR] failed to start dispatcher_loop on pageserver"); + neon_log(ERROR, "could not send pagestream command to pageserver"); } while (PQisBusy(pageserver_conn)) @@ -105,14 +104,13 @@ zenith_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; - zenith_log(ERROR, "[ZENITH_SMGR] failed to get handshake from pageserver: %s", - msg); + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); } } } - // FIXME: when auth is enabled this ptints JWT to logs - zenith_log(LOG, "libpqpagestore: connected to '%s'", page_server_connstring); + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); connected = true; } @@ -126,7 +124,7 @@ call_PQgetCopyData(PGconn *conn, char **buffer) int ret; retry: - ret = PQgetCopyData(conn, buffer, 1 /* async */); + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); if (ret == 0) { @@ -146,8 +144,8 @@ call_PQgetCopyData(PGconn *conn, char **buffer) if (wc & WL_SOCKET_READABLE) { if (!PQconsumeInput(conn)) - zenith_log(ERROR, "could not get response from pageserver: %s", - PQerrorMessage(conn)); + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); } goto retry; @@ -158,7 +156,7 @@ call_PQgetCopyData(PGconn *conn, char **buffer) static ZenithResponse * -zenith_call(ZenithRequest *request) +pageserver_call(ZenithRequest *request) { StringInfoData req_buff; StringInfoData resp_buff; @@ -175,7 +173,7 @@ zenith_call(ZenithRequest *request) } if (!connected) - zenith_connect(); + pageserver_connect(); req_buff = zm_pack_request(request); @@ -184,21 +182,21 @@ zenith_call(ZenithRequest *request) * 
* In principle, this could block if the output buffer is full, and we * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output and - * TCP buffer. + * practice, our requests are small enough to always fit in the output + * and TCP buffer. */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) { - zenith_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); } pfree(req_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) + if (message_level_is_interesting(PageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) request); - zenith_log(PqPageStoreTrace, "Sent request: %s", msg); + neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } @@ -207,25 +205,20 @@ zenith_call(ZenithRequest *request) resp_buff.cursor = 0; if (resp_buff.len == -1) - zenith_log(ERROR, "end of COPY"); + neon_log(ERROR, "end of COPY"); else if (resp_buff.len == -2) - zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) + if (message_level_is_interesting(PageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) resp); - zenith_log(PqPageStoreTrace, "Got response: %s", msg); + neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); } - - /* - * XXX: zm_to_string leak strings. Check with what memory contex all this - * methods are called. - */ } PG_CATCH(); { @@ -238,7 +231,7 @@ zenith_call(ZenithRequest *request) */ if (connected) { - zenith_log(LOG, "dropping connection to page server due to error"); + neon_log(LOG, "dropping connection to page server due to error"); PQfinish(pageserver_conn); pageserver_conn = NULL; connected = false; @@ -271,11 +264,13 @@ substitute_pageserver_password(const char *page_server_connstring_raw) PQconninfoOption *conn_options; PQconninfoOption *conn_option; MemoryContext oldcontext; + /* - * Here we substitute password in connection string with an environment variable. - * To simplify things we construct a connection string back with only known options. - * In particular: host port user and password. We do not currently use other options and - * constructing full connstring in an URI shape is quite messy. + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. 
*/ if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') @@ -302,15 +297,18 @@ substitute_pageserver_password(const char *page_server_connstring_raw) */ for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) { - if (strcmp(conn_option->keyword, "host") == 0) { + if (strcmp(conn_option->keyword, "host") == 0) + { if (conn_option->val != NULL && conn_option->val[0] != '\0') host = conn_option->val; } - else if (strcmp(conn_option->keyword, "port") == 0) { + else if (strcmp(conn_option->keyword, "port") == 0) + { if (conn_option->val != NULL && conn_option->val[0] != '\0') port = conn_option->val; } - else if (strcmp(conn_option->keyword, "user") == 0) { + else if (strcmp(conn_option->keyword, "user") == 0) + { if (conn_option->val != NULL && conn_option->val[0] != '\0') user = conn_option->val; } @@ -324,7 +322,7 @@ substitute_pageserver_password(const char *page_server_connstring_raw) (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); + neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); auth_token = getenv(&conn_option->val[1]); if (!auth_token) { @@ -334,12 +332,16 @@ substitute_pageserver_password(const char *page_server_connstring_raw) } else { - zenith_log(LOG, "using auth token from environment passed via env"); + neon_log(LOG, "using auth token from environment passed via env"); } } } } - // allocate connection string in a TopMemoryContext to make sure it is not freed + + /* + * allocate connection string in TopMemoryContext to make sure it is not + * freed + */ oldcontext = CurrentMemoryContext; MemoryContextSwitchTo(TopMemoryContext); page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port); @@ -398,15 +400,15 @@ _PG_init(void) -1, -1, INT_MAX, PGC_SIGHUP, GUC_UNIT_MB, - NULL, NULL, NULL); + NULL, NULL, NULL); relsize_hash_init(); EmitWarningsOnPlaceholders("neon"); if (page_server != NULL) - zenith_log(ERROR, "libpqpagestore already loaded"); + neon_log(ERROR, "libpagestore already loaded"); - zenith_log(PqPageStoreTrace, "libpqpagestore already loaded"); + neon_log(PageStoreTrace, "libpagestore already loaded"); page_server = &api; /* substitute password in pageserver_connstring */ @@ -415,18 +417,22 @@ _PG_init(void) /* Is there more correct way to pass CustomGUC to postgres code? 
*/ zenith_timeline_walproposer = zenith_timeline; zenith_tenant_walproposer = zenith_tenant; - /* Walproposer instructcs safekeeper which pageserver to use for replication */ + + /* + * Walproposer instructs safekeeper which pageserver to use for + * replication + */ zenith_pageserver_connstring_walproposer = page_server_connstring; if (wal_redo) { - zenith_log(PqPageStoreTrace, "set inmem_smgr hook"); + neon_log(PageStoreTrace, "set inmem_smgr hook"); smgr_hook = smgr_inmem; smgr_init_hook = smgr_init_inmem; } else if (page_server_connstring && page_server_connstring[0]) { - zenith_log(PqPageStoreTrace, "set zenith_smgr hook"); + neon_log(PageStoreTrace, "set neon_smgr hook"); smgr_hook = smgr_zenith; smgr_init_hook = smgr_init_zenith; dbsize_hook = zenith_dbsize; From fadb42166f30d832a14245397ec3251833df8919 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 9 Jul 2022 16:30:41 +0300 Subject: [PATCH 151/214] Large last written lsn cache (#177) Maintain cache of last written LSN for each relation segment (8 Mb). --- contrib/neon/pagestore_smgr.c | 31 ++-- src/backend/access/gin/gininsert.c | 2 +- src/backend/access/gist/gistbuild.c | 8 +- src/backend/access/spgist/spginsert.c | 3 +- src/backend/access/transam/xlog.c | 186 ++++++++++++++++++++--- src/backend/commands/dbcommands.c | 4 +- src/backend/replication/walsender.c | 7 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 10 ++ src/include/access/xlog.h | 5 +- 10 files changed, 221 insertions(+), 36 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 5fdfea5e487..a8b73c5f342 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -84,6 +84,11 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + page_server_api *page_server; /* GUCs */ @@ -558,7 +563,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, blocknum, blocknum); } @@ -603,7 +608,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest) +zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno) { XLogRecPtr lsn; @@ -630,9 +635,9 @@ zenith_get_request_lsn(bool *latest) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(); + lsn = GetLastWrittenLSN(rnode, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -716,7 +721,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -791,7 +796,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. 
Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation. Currently, we don't call SetLastWrittenLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. @@ -904,6 +909,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif + + SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); } /* @@ -1079,7 +1086,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, blkno); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1284,7 +1291,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1344,7 +1351,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, InvalidOid, REL_METADATA_PSEUDO_BLOCKNO); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1431,7 +1438,11 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn); + /* + * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, + * either update LSN for "dummy" metadata block. Second approach seems to be more efficient. 
+ */ + SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index dfad28d1f61..ea358d2038e 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,8 +421,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 55a194a691f..73f8bb92bd9 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -335,9 +335,10 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, + index->rd_smgr->smgr_rnode.node.relNode, + 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd); - smgr_end_unlogged_build(index->rd_smgr); } @@ -467,7 +468,8 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, + GIST_ROOT_BLKNO, GIST_ROOT_BLKNO); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index a7608f4d54c..fec27816765 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,8 +143,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, + 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 4cf4e43b75b..d1bcdbb763b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -113,6 +113,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -182,6 +183,28 @@ const struct config_enum_entry recovery_target_action_options[] = { {NULL, 0, false} }; + +/* + * We are not taken in account dbnode, spcnode, forknum fields of + * relation tag, because possibility of collision is assumed to be small + * and should not affect performance. And reducing cache key size speed-up + * hash calculation and comparison. + */ +typedef struct LastWrittenLsnCacheKey +{ + Oid relid; + BlockNumber bucket; +} LastWrittenLsnCacheKey; + +typedef struct LastWrittenLsnCacheEntry +{ + LastWrittenLsnCacheKey key; + XLogRecPtr lsn; + /* L2-List for LRU replacement algorithm */ + struct LastWrittenLsnCacheEntry* next; + struct LastWrittenLsnCacheEntry* prev; +} LastWrittenLsnCacheEntry; + /* * Statistics for current checkpoint are collected in this global struct. 
* Because only the checkpointer or a stand-alone backend can perform @@ -751,6 +774,17 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ + LastWrittenLsnCacheEntry lastWrittenLsnLRU; + /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -762,6 +796,7 @@ typedef struct XLogCtlData slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; + static XLogCtlData *XLogCtl = NULL; /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ @@ -772,6 +807,19 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; +#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ + + +/* + * Cache of last written LSN for each relation chunk (hash bucket). + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. + * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. + * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; + /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! @@ -5143,11 +5191,8 @@ LocalProcessControlFile(bool reset) ReadControlFile(); } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) +static Size +XLOGCtlShmemSize(void) { Size size; @@ -5187,6 +5232,16 @@ XLOGShmemSize(void) return size; } +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + return XLOGCtlShmemSize() + + hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); +} + void XLOGShmemInit(void) { @@ -5216,6 +5271,15 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + { + static HASHCTL info; + info.keysize = sizeof(LastWrittenLsnCacheKey); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, + &info, + HASH_ELEM | HASH_BLOBS); + } localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -8113,7 +8177,8 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->lastWrittenPageLSN = EndOfLog; + XLogCtl->maxLastWrittenLsn = EndOfLog; + XLogCtl->lastWrittenLsnLRU.next = XLogCtl->lastWrittenLsnLRU.prev = &XLogCtl->lastWrittenLsnLRU; LocalSetXLogInsertAllowed(); @@ -8889,29 +8954,116 @@ GetInsertRecPtr(void) } /* - * GetLastWrittenPageLSN -- Returns maximal LSN of written page + * GetLastWrittenLSN -- Returns maximal LSN of written page. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough ,iterting through all hash items may be rather expensive. 
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. */ XLogRecPtr -GetLastWrittenPageLSN(void) +GetLastWrittenLSN(Oid rnode, BlockNumber blkno) { XLogRecPtr lsn; - SpinLockAcquire(&XLogCtl->info_lck); - lsn = XLogCtl->lastWrittenPageLSN; - SpinLockRelease(&XLogCtl->info_lck); + LastWrittenLsnCacheEntry* entry; + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = XLogCtl->maxLastWrittenLsn; + + if (rnode != InvalidOid) + { + LastWrittenLsnCacheKey key; + key.relid = rnode; + key.bucket = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; + } + else + { + HASH_SEQ_STATUS seq; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + } + LWLockRelease(LastWrittenLsnLock); return lsn; } /* - * SetLastWrittenPageLSN -- Set maximal LSN of written page + * SetLastWrittenLSN -- Set maximal LSN of written page. + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. To reduce cache size we store max LSN not for each page, but for + * bucket (1024 blocks). This cache allows to use old LSN when + * requesting pages of unchanged or appended relations. + * + * rnode can be InvalidOid, in this case maxLastWrittenLsn is updated. SetLastWrittensn with InvalidOid + * is used by createdb and dbase_redo functions. */ void -SetLastWrittenPageLSN(XLogRecPtr lsn) +SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) { - SpinLockAcquire(&XLogCtl->info_lck); - if (lsn > XLogCtl->lastWrittenPageLSN) - XLogCtl->lastWrittenPageLSN = lsn; - SpinLockRelease(&XLogCtl->info_lck); + if (lsn == InvalidXLogRecPtr) + return; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + if (rnode == InvalidOid) + { + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + LastWrittenLsnCacheKey key; + bool found; + BlockNumber bucket; + + key.relid = rnode; + for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket++) + { + key.bucket = bucket; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + entry->next->prev = entry->prev; + entry->prev->next = entry->next; + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = XLogCtl->lastWrittenLsnLRU.prev; + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + + victim->next->prev = victim->prev; + victim->prev->next = victim->next; + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the head of LRU list */ + entry->next = XLogCtl->lastWrittenLsnLRU.next; + entry->prev = &XLogCtl->lastWrittenLsnLRU; + XLogCtl->lastWrittenLsnLRU.next = entry->next->prev = entry; + } + } + LWLockRelease(LastWrittenLsnLock); } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 97a58988fb0..8c96a43f47f 100644 --- 
a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -685,7 +685,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSN(lsn, InvalidOid, 0, 0); } } table_endscan(scan); @@ -2364,7 +2364,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSN(lsn, InvalidOid, 0, 0); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a841beebf3f..8f9de58d158 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2058,6 +2058,13 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); + /* + * walproposer use trunclateLsn instead of flushPtr for confirmed + * received location, so we shouldn't update restart_lsn here. + */ + if (am_wal_proposer) + return; + /* * walproposer use trunclateLsn instead of flushPtr for confirmed * received location, so we shouldn't update restart_lsn here. diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..b4652c33ff6 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LastWrittenLsnLock 48 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 500aa672611..cd2dd8c491f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2379,6 +2379,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Size of las written LSN cache used by Neon."), + NULL + }, + &lastWrittenLsnCacheSize, + 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 66fe9dfcd9e..cd4e6c7f876 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -132,6 +132,7 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; +extern int lastWrittenLsnCacheSize; /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; @@ -351,8 +352,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn); -extern XLogRecPtr GetLastWrittenPageLSN(void); +extern void SetLastWrittenLSN(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till); +extern XLogRecPtr GetLastWrittenLSN(Oid relfilenode, BlockNumber blkno); extern XLogRecPtr GetRedoStartLsn(void); From 561bee3ba7531a9af49a7e26dd36d730099e2409 Mon Sep 17 00:00:00 2001 From: MMeent Date: Tue, 12 Jul 2022 17:53:08 +0200 Subject: [PATCH 152/214] Add uuid-ossp to the supported extensions (#181) * Add uuid-ossp to the supported extensions Also update compile flags to `-O2` to trade compile time for PostgreSQL performance, and removes --enable-cassert. 
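For reference, a minimal usage sketch of the extension this image now provides (standard uuid-ossp API; shown only as an illustration, not part of the build change itself):

    -- run against the built server, e.g. via psql
    CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
    SELECT uuid_generate_v4();  -- returns a random (version 4) UUID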
--- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index a5112e1b4d1..126aa1d5e0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ FROM neondatabase/compute-tools:$COMPUTE_TOOLS_TAG AS compute-deps FROM debian:buster-slim AS build-deps RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev + libcurl4-openssl-dev libossp-uuid-dev # # Image with built Postgres @@ -30,7 +30,7 @@ COPY . /pg/ # Build and install Postgres locally RUN mkdir /pg/compute_build && cd /pg/compute_build && \ - ../configure CFLAGS='-O0 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --enable-cassert --enable-depend && \ + ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --enable-uuid=ossp && \ # Install main binaries and contribs make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ From a96326cef8e9c1c629c97f40f724c13f864e3b18 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 19 Jul 2022 07:50:00 +0300 Subject: [PATCH 153/214] Update last written LSN for gin/gist index metadata (#182) * Update last written LSN for gin/gist index metadata * Replace SetLastWrittenLSN with family of SetLastWrittenLSNFFor* functions --- contrib/neon/pagestore_smgr.c | 11 +++------ src/backend/access/gin/gininsert.c | 3 ++- src/backend/access/gist/gistbuild.c | 8 ++++--- src/backend/access/spgist/spginsert.c | 3 ++- src/backend/access/transam/xlog.c | 34 ++++++++++++++++++++++++--- src/backend/commands/dbcommands.c | 5 ++-- src/include/access/xlog.h | 10 +++++++- 7 files changed, 54 insertions(+), 20 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index a8b73c5f342..1beef4c95ec 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -84,11 +84,6 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -/* - * Pseudo block number used to associate LSN with relation metadata (relation size) - */ -#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber - page_server_api *page_server; /* GUCs */ @@ -563,7 +558,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, blocknum, blocknum); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node.relNode, blocknum); } @@ -910,7 +905,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif - SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node.relNode); } /* @@ -1442,7 +1437,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, * either update LSN for "dummy" metadata block. Second approach seems to be more efficient. 
*/ - SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node.relNode); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index ea358d2038e..75ea7c846a6 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,7 +421,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 73f8bb92bd9..e3afb688177 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -335,9 +335,10 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSN(XactLastRecEnd, + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } smgr_end_unlogged_build(index->rd_smgr); } @@ -468,8 +469,9 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, - GIST_ROOT_BLKNO, GIST_ROOT_BLKNO); + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, + GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index fec27816765..55ea935642c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,8 +143,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d1bcdbb763b..dfc0b4bb71c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8998,17 +8998,18 @@ GetLastWrittenLSN(Oid rnode, BlockNumber blkno) } /* - * SetLastWrittenLSN -- Set maximal LSN of written page. + * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. * We maintain cache of last written LSNs with limited size and LRU replacement * policy. To reduce cache size we store max LSN not for each page, but for * bucket (1024 blocks). This cache allows to use old LSN when * requesting pages of unchanged or appended relations. 
* - * rnode can be InvalidOid, in this case maxLastWrittenLsn is updated. SetLastWrittensn with InvalidOid + * rnode can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with InvalidOid * is used by createdb and dbase_redo functions. */ void -SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) { if (lsn == InvalidXLogRecPtr) return; @@ -9066,6 +9067,33 @@ SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) LWLockRelease(LastWrittenLsnLock); } +/* + * SetLastWrittenLSNForBlock -- Set maximal LSN for block + */ +void +SetLastWrittenLSNForBlock(XLogRecPtr lsn, Oid rnode, BlockNumber blkno) +{ + SetLastWrittenLSNForBlockRange(lsn, rnode, blkno, blkno); +} + +/* + * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata + */ +void +SetLastWrittenLSNForRelation(XLogRecPtr lsn, Oid rnode) +{ + SetLastWrittenLSNForBlock(lsn, rnode, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database + */ +void +SetLastWrittenLSNForDatabase(XLogRecPtr lsn) +{ + SetLastWrittenLSNForBlock(lsn, InvalidOid, 0); +} + /* * RedoStartLsn is set only once by startup process, locking is not required * after its exit. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 8c96a43f47f..943cd8a696d 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -685,7 +685,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenLSN(lsn, InvalidOid, 0, 0); + SetLastWrittenLSNForDatabase(lsn); } } table_endscan(scan); @@ -2363,8 +2363,7 @@ dbase_redo(XLogReaderState *record) */ { XLogRecPtr lsn = record->EndRecPtr; - - SetLastWrittenLSN(lsn, InvalidOid, 0, 0); + SetLastWrittenLSNForDatabase(lsn); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd4e6c7f876..9cc214352f2 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -31,6 +31,11 @@ extern int sync_method; extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + /* * Prior to 8.4, all activity during recovery was carried out by the startup * process. 
This local variable continues to be used in many parts of the @@ -352,7 +357,10 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenLSN(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till); +extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, Oid relfilenode, BlockNumber blkno); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till); +extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); +extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, Oid relfilenode); extern XLogRecPtr GetLastWrittenLSN(Oid relfilenode, BlockNumber blkno); extern XLogRecPtr GetRedoStartLsn(void); From fd0465d41c09a0e53f5e55fbd68c04082e125a17 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Jul 2022 13:40:44 +0300 Subject: [PATCH 154/214] Revert "Update last written LSN for gin/gist index metadata (#182)" (#183) This reverts commit 7517d1cec45224841eac327cad7e0ddc81c734ff. Revert "Large last written lsn cache (#177)" This reverts commit 595ac69260719d8d7b43c09ab7dfd8f232542e50. --- contrib/neon/pagestore_smgr.c | 26 ++- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/gist/gistbuild.c | 10 +- src/backend/access/spgist/spginsert.c | 4 +- src/backend/access/transam/xlog.c | 214 ++--------------------- src/backend/commands/dbcommands.c | 5 +- src/backend/replication/walsender.c | 7 - src/backend/storage/lmgr/lwlocknames.txt | 1 - src/backend/utils/misc/guc.c | 10 -- src/include/access/xlog.h | 13 +- 10 files changed, 37 insertions(+), 256 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 1beef4c95ec..5fdfea5e487 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -558,7 +558,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node.relNode, blocknum); + SetLastWrittenPageLSN(lsn); } @@ -603,7 +603,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno) +zenith_get_request_lsn(bool *latest) { XLogRecPtr lsn; @@ -630,9 +630,9 @@ zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenLSN(rnode, blkno); + lsn = GetLastWrittenPageLSN(); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -716,7 +716,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -791,7 +791,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenLSN() when a new + * relation. 
Currently, we don't call SetLastWrittenPageLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. @@ -904,8 +904,6 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif - - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node.relNode); } /* @@ -1081,7 +1079,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, blkno); + request_lsn = zenith_get_request_lsn(&latest); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1286,7 +1284,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = zenith_get_request_lsn(&latest); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1346,7 +1344,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest, InvalidOid, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = zenith_get_request_lsn(&latest); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1433,11 +1431,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - /* - * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, - * either update LSN for "dummy" metadata block. Second approach seems to be more efficient. 
- */ - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(lsn); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 75ea7c846a6..dfad28d1f61 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,9 +421,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); - SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } + SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index e3afb688177..55a194a691f 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -335,11 +335,9 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSNForBlockRange(XactLastRecEnd, - index->rd_smgr->smgr_rnode.node.relNode, - 0, RelationGetNumberOfBlocks(index)); - SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } + SetLastWrittenPageLSN(XactLastRecEnd); + smgr_end_unlogged_build(index->rd_smgr); } @@ -469,9 +467,7 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, - GIST_ROOT_BLKNO); - SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(lsn); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 55ea935642c..a7608f4d54c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,10 +143,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); - SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, - 0, RelationGetNumberOfBlocks(index)); - SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); } + SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dfc0b4bb71c..4cf4e43b75b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -113,7 +113,6 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; uint64 predefined_sysidentifier; -int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -183,28 +182,6 @@ const struct config_enum_entry recovery_target_action_options[] = { {NULL, 0, false} }; - -/* - * We are not taken in account dbnode, spcnode, forknum fields of - * relation tag, because possibility of collision is assumed to be small - * and should not affect performance. And reducing cache key size speed-up - * hash calculation and comparison. 
- */ -typedef struct LastWrittenLsnCacheKey -{ - Oid relid; - BlockNumber bucket; -} LastWrittenLsnCacheKey; - -typedef struct LastWrittenLsnCacheEntry -{ - LastWrittenLsnCacheKey key; - XLogRecPtr lsn; - /* L2-List for LRU replacement algorithm */ - struct LastWrittenLsnCacheEntry* next; - struct LastWrittenLsnCacheEntry* prev; -} LastWrittenLsnCacheEntry; - /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -774,17 +751,6 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; - /* - * Maximal last written LSN for pages not present in lastWrittenLsnCache - */ - XLogRecPtr maxLastWrittenLsn; - - /* - * Double linked list to implement LRU replacement policy for last written LSN cache. - * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. - */ - LastWrittenLsnCacheEntry lastWrittenLsnLRU; - /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -796,7 +762,6 @@ typedef struct XLogCtlData slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; - static XLogCtlData *XLogCtl = NULL; /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ @@ -807,19 +772,6 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; -#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ - - -/* - * Cache of last written LSN for each relation chunk (hash bucket). - * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last - * relation metadata update. - * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), - * pages are replaced using LRU algorithm, based on L2-list. - * Access to this cache is protected by 'LastWrittenLsnLock'. - */ -static HTAB *lastWrittenLsnCache; - /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! 
@@ -5191,8 +5143,11 @@ LocalProcessControlFile(bool reset) ReadControlFile(); } -static Size -XLOGCtlShmemSize(void) +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) { Size size; @@ -5232,16 +5187,6 @@ XLOGCtlShmemSize(void) return size; } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) -{ - return XLOGCtlShmemSize() + - hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); -} - void XLOGShmemInit(void) { @@ -5271,15 +5216,6 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); - { - static HASHCTL info; - info.keysize = sizeof(LastWrittenLsnCacheKey); - info.entrysize = sizeof(LastWrittenLsnCacheEntry); - lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", - lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, - &info, - HASH_ELEM | HASH_BLOBS); - } localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -8177,8 +8113,7 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->maxLastWrittenLsn = EndOfLog; - XLogCtl->lastWrittenLsnLRU.next = XLogCtl->lastWrittenLsnLRU.prev = &XLogCtl->lastWrittenLsnLRU; + XLogCtl->lastWrittenPageLSN = EndOfLog; LocalSetXLogInsertAllowed(); @@ -8954,144 +8889,29 @@ GetInsertRecPtr(void) } /* - * GetLastWrittenLSN -- Returns maximal LSN of written page. - * It returns an upper bound for the last written LSN of a given page, - * either from a cached last written LSN or a global maximum last written LSN. - * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. - * If cache is large enough ,iterting through all hash items may be rather expensive. - * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. + * GetLastWrittenPageLSN -- Returns maximal LSN of written page */ XLogRecPtr -GetLastWrittenLSN(Oid rnode, BlockNumber blkno) +GetLastWrittenPageLSN(void) { XLogRecPtr lsn; - LastWrittenLsnCacheEntry* entry; - - LWLockAcquire(LastWrittenLsnLock, LW_SHARED); - - /* Maximal last written LSN among all non-cached pages */ - lsn = XLogCtl->maxLastWrittenLsn; - - if (rnode != InvalidOid) - { - LastWrittenLsnCacheKey key; - key.relid = rnode; - key.bucket = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; - entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); - if (entry != NULL) - lsn = entry->lsn; - } - else - { - HASH_SEQ_STATUS seq; - /* Find maximum of all cached LSNs */ - hash_seq_init(&seq, lastWrittenLsnCache); - while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) - { - if (entry->lsn > lsn) - lsn = entry->lsn; - } - } - LWLockRelease(LastWrittenLsnLock); + SpinLockAcquire(&XLogCtl->info_lck); + lsn = XLogCtl->lastWrittenPageLSN; + SpinLockRelease(&XLogCtl->info_lck); return lsn; } /* - * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. - * We maintain cache of last written LSNs with limited size and LRU replacement - * policy. To reduce cache size we store max LSN not for each page, but for - * bucket (1024 blocks). This cache allows to use old LSN when - * requesting pages of unchanged or appended relations. - * - * rnode can be InvalidOid, in this case maxLastWrittenLsn is updated. - * SetLastWrittenLsn with InvalidOid - * is used by createdb and dbase_redo functions. 
- */ -void -SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) -{ - if (lsn == InvalidXLogRecPtr) - return; - - LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); - if (rnode == InvalidOid) - { - if (lsn > XLogCtl->maxLastWrittenLsn) - XLogCtl->maxLastWrittenLsn = lsn; - } - else - { - LastWrittenLsnCacheEntry* entry; - LastWrittenLsnCacheKey key; - bool found; - BlockNumber bucket; - - key.relid = rnode; - for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket++) - { - key.bucket = bucket; - entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); - if (found) - { - if (lsn > entry->lsn) - entry->lsn = lsn; - /* Unlink from LRU list */ - entry->next->prev = entry->prev; - entry->prev->next = entry->next; - } - else - { - entry->lsn = lsn; - if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) - { - /* Replace least recently used entry */ - LastWrittenLsnCacheEntry* victim = XLogCtl->lastWrittenLsnLRU.prev; - /* Adjust max LSN for not cached relations/chunks if needed */ - if (victim->lsn > XLogCtl->maxLastWrittenLsn) - XLogCtl->maxLastWrittenLsn = victim->lsn; - - victim->next->prev = victim->prev; - victim->prev->next = victim->next; - hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); - } - } - /* Link to the head of LRU list */ - entry->next = XLogCtl->lastWrittenLsnLRU.next; - entry->prev = &XLogCtl->lastWrittenLsnLRU; - XLogCtl->lastWrittenLsnLRU.next = entry->next->prev = entry; - } - } - LWLockRelease(LastWrittenLsnLock); -} - -/* - * SetLastWrittenLSNForBlock -- Set maximal LSN for block + * SetLastWrittenPageLSN -- Set maximal LSN of written page */ void -SetLastWrittenLSNForBlock(XLogRecPtr lsn, Oid rnode, BlockNumber blkno) +SetLastWrittenPageLSN(XLogRecPtr lsn) { - SetLastWrittenLSNForBlockRange(lsn, rnode, blkno, blkno); -} - -/* - * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata - */ -void -SetLastWrittenLSNForRelation(XLogRecPtr lsn, Oid rnode) -{ - SetLastWrittenLSNForBlock(lsn, rnode, REL_METADATA_PSEUDO_BLOCKNO); -} - -/* - * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database - */ -void -SetLastWrittenLSNForDatabase(XLogRecPtr lsn) -{ - SetLastWrittenLSNForBlock(lsn, InvalidOid, 0); + SpinLockAcquire(&XLogCtl->info_lck); + if (lsn > XLogCtl->lastWrittenPageLSN) + XLogCtl->lastWrittenPageLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 943cd8a696d..97a58988fb0 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -685,7 +685,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenLSNForDatabase(lsn); + SetLastWrittenPageLSN(lsn); } } table_endscan(scan); @@ -2363,7 +2363,8 @@ dbase_redo(XLogReaderState *record) */ { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenLSNForDatabase(lsn); + + SetLastWrittenPageLSN(lsn); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 8f9de58d158..a841beebf3f 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2058,13 +2058,6 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); - /* - * walproposer use trunclateLsn instead of flushPtr for confirmed - * received location, so 
we shouldn't update restart_lsn here. - */ - if (am_wal_proposer) - return; - /* * walproposer use trunclateLsn instead of flushPtr for confirmed * received location, so we shouldn't update restart_lsn here. diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index b4652c33ff6..6c7cf6c2956 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,4 +53,3 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 -LastWrittenLsnLock 48 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index cd2dd8c491f..500aa672611 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2379,16 +2379,6 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, - gettext_noop("Size of las written LSN cache used by Neon."), - NULL - }, - &lastWrittenLsnCacheSize, - 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ - NULL, NULL, NULL - }, - { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 9cc214352f2..66fe9dfcd9e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -31,11 +31,6 @@ extern int sync_method; extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ -/* - * Pseudo block number used to associate LSN with relation metadata (relation size) - */ -#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber - /* * Prior to 8.4, all activity during recovery was carried out by the startup * process. This local variable continues to be used in many parts of the @@ -137,7 +132,6 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; -extern int lastWrittenLsnCacheSize; /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; @@ -357,11 +351,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, Oid relfilenode, BlockNumber blkno); -extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till); -extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); -extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, Oid relfilenode); -extern XLogRecPtr GetLastWrittenLSN(Oid relfilenode, BlockNumber blkno); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn); +extern XLogRecPtr GetLastWrittenPageLSN(void); extern XLogRecPtr GetRedoStartLsn(void); From b1091eb6461457aa6b5139b983d37934101c4b1d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 1 Aug 2022 22:27:48 +0300 Subject: [PATCH 155/214] Fix `uuid-ossp` build --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 126aa1d5e0d..11681c9cb16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,7 @@ COPY . 
/pg/ # Build and install Postgres locally RUN mkdir /pg/compute_build && cd /pg/compute_build && \ - ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --enable-uuid=ossp && \ + ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ # Install main binaries and contribs make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ @@ -47,7 +47,7 @@ WORKDIR /pg FROM debian:buster-slim # libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install libreadline-dev +RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ From 6fa0782e230c8ab5756d205f728ff6a4e66b39a3 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Mon, 8 Aug 2022 13:25:34 +0200 Subject: [PATCH 156/214] Remove redundant const (#189) --- src/include/storage/smgr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 4a0d6b2e09b..5fcb2060007 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -132,7 +132,7 @@ extern void smgr_init_standard(void); extern void smgr_shutdown_standard(void); // Alternative implementation of calculate_database_size() -typedef const int64 (*dbsize_hook_type) (Oid dbOid); +typedef int64 (*dbsize_hook_type) (Oid dbOid); extern PGDLLIMPORT dbsize_hook_type dbsize_hook; typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); From cb8cd753bb78ad5f1aacd68340cb6ed3d964372c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 12 Aug 2022 17:19:09 +0300 Subject: [PATCH 157/214] Eliminate UnkonwnXLogRecPtr and always use InvalidXLogRecPtr instead (#192) * Eliminate UnkonwnXLogRecPtr and always use InvalidXLogRecPtr instead * Remove GetMinReplicaLsn function --- src/backend/replication/walproposer.c | 9 ++++--- src/backend/replication/walsender.c | 38 +++------------------------ src/include/access/xlogdefs.h | 8 ------ src/include/replication/walsender.h | 1 - 4 files changed, 8 insertions(+), 48 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4b0567be707..ef3b1917ff0 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1894,11 +1894,12 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) static XLogRecPtr CalculateMinFlushLsn(void) { - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) + XLogRecPtr lsn = n_safekeepers > 0 + ? 
safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.flushLsn < lsn) - lsn = safekeeper[i].appendResponse.flushLsn; + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); } return lsn; } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a841beebf3f..e97184490c5 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3820,38 +3820,6 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) return now - time; } -/* - * Get minimal write and flush LSN among all live replicas - */ -void -GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply_lsn) -{ - XLogRecPtr min_write_lsn = UnknownXLogRecPtr; - XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; - XLogRecPtr min_apply_lsn = UnknownXLogRecPtr; - for (int i = 0; i < max_wal_senders; i++) - { - WalSnd *walsnd = &WalSndCtl->walsnds[i]; - if (walsnd->state == WALSNDSTATE_STREAMING) - { - /* - * We assume that reads from walsnd->write/flush are atomic - * on all modern x64 systems, as these fields are uint64 and - * should be 8-bytes aligned. - */ - XLogRecPtr written = walsnd->write; - XLogRecPtr flushed = walsnd->flush; - XLogRecPtr applied = walsnd->apply; - min_write_lsn = Min(written, min_write_lsn); - min_flush_lsn = Min(flushed, min_flush_lsn); - min_apply_lsn = Min(applied, min_apply_lsn); - } - } - *write_lsn = min_write_lsn; - *flush_lsn = min_flush_lsn; - *apply_lsn = min_apply_lsn; -} - // Check if we need to suspend inserts because of lagging replication. uint64 backpressure_lag(void) @@ -3872,21 +3840,21 @@ backpressure_lag(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); - if ((writePtr != UnknownXLogRecPtr + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag*MB)) { return (myFlushLsn - writePtr - max_replication_write_lag*MB); } - if ((flushPtr != UnknownXLogRecPtr + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); } - if ((applyPtr != UnknownXLogRecPtr + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) { diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index d44d5e64cdc..0940b64ca6b 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -28,14 +28,6 @@ typedef uint64 XLogRecPtr; #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) -/* - * Maximum possible XLogRecPtr value. Currently used by back pressure - * mechanism to distinguish the unknown replica flush/write position. - * This significantly simplifies comparison and checks as we always - * look for the minimal value. - */ -#define UnknownXLogRecPtr ((XLogRecPtr) ~0) - /* * First LSN to use for "fake" LSNs. 
* diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index fe21617994a..b597823c7c3 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -48,7 +48,6 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); -extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); extern uint64 backpressure_lag(void); /* * Remember that we want to wakeup walsenders later From 56de5f27cf5677a3f55810ef0b5a0bef8189cb98 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 15 Aug 2022 15:05:46 +0300 Subject: [PATCH 158/214] Init wal redo buffer for fpi (#194) * Initialize wal_redo_buffer after applying record with FPI refer #1915 * Update comment * Update src/backend/tcop/zenith_wal_redo.c Co-authored-by: Heikki Linnakangas * Update src/backend/tcop/zenith_wal_redo.c Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- src/backend/tcop/zenith_wal_redo.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 96e1058c406..67653170bac 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -98,6 +98,12 @@ static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ Buffer wal_redo_buffer; bool am_wal_redo_postgres; @@ -516,6 +522,7 @@ BeginRedoForBlock(StringInfo input_message) rnode.dbNode = pq_getmsgint(input_message, 4); rnode.relNode = pq_getmsgint(input_message, 4); blknum = pq_getmsgint(input_message, 4); + wal_redo_buffer = InvalidBuffer; INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); @@ -624,7 +631,16 @@ ApplyRecord(StringInfo input_message) redo_read_buffer_filter = redo_block_filter; RmgrTable[record->xl_rmid].rm_redo(reader_state); - + /* + * If no base image of the page was provided by PushPage, initialize wal_redo_buffer here. + * The first WAL record must initialize the page in that case. + */ + if (BufferIsInvalid(wal_redo_buffer)) + { + wal_redo_buffer = ReadBufferWithoutRelcache(target_redo_tag.rnode, target_redo_tag.forkNum, target_redo_tag.blockNum, RBM_NORMAL, NULL); + Assert(!BufferIsInvalid(wal_redo_buffer)); + ReleaseBuffer(wal_redo_buffer); + } redo_read_buffer_filter = NULL; /* Pop the error context stack */ @@ -714,6 +730,7 @@ GetPage(StringInfo input_message) /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL); + Assert(buf == wal_redo_buffer); page = BufferGetPage(buf); /* single thread, so don't bother locking the page */ @@ -736,6 +753,7 @@ GetPage(StringInfo input_message) ReleaseBuffer(buf); DropRelFileNodeAllLocalBuffers(rnode); + wal_redo_buffer = InvalidBuffer; elog(TRACE, "Page sent back for block %u", blknum); } From 12b4ce32488a7ac8d27f0ef21965bc0b5acbda90 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 14 Aug 2022 14:13:48 +0300 Subject: [PATCH 159/214] Stamp XLP_FIRST_IS_CONTRECORD only if we start writing with page offset. 
Without this patch, on bootstrap XLP_FIRST_IS_CONTRECORD has been always put on header of a page where WAL writing continues. This confuses WAL decoding on safekeepers, making it think decoding starts in the middle of a record, leading to 2022-08-12T17:48:13.816665Z ERROR {tid=37}: query handler for 'START_WAL_PUSH postgresql://no_user:@localhost:15050' failed: failed to run ReceiveWalConn Caused by: 0: failed to process ProposerAcceptorMessage 1: invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD at 0/2CF8000 --- src/backend/access/transam/xlog.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 4cf4e43b75b..311a6f54fbc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7874,14 +7874,20 @@ StartupXLOG(void) { int offs = (EndRecPtr % XLOG_BLCKSZ); XLogRecPtr lastPage = EndRecPtr - offs; + int lastPageSize = ((lastPage % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; int idx = XLogRecPtrToBufIdx(lastPage); XLogPageHeader xlogPageHdr = (XLogPageHeader) (XLogCtl->pages + idx * XLOG_BLCKSZ); xlogPageHdr->xlp_pageaddr = lastPage; xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; xlogPageHdr->xlp_tli = ThisTimeLineID; - xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; // FIXME - xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; + /* + * If we start writing with offset from page beginning, pretend in + * page header there is a record ending where actual data will + * start. + */ + xlogPageHdr->xlp_rem_len = offs - lastPageSize; + xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0; readOff = XLogSegmentOffset(lastPage, wal_segment_size); elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); From fa779a17d24a97d395374193a4cf53a7fcb3ceaa Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 18 Aug 2022 15:50:13 +0200 Subject: [PATCH 160/214] Pull 99% of walproposer code into extension. (#188) * Pull 99% of walproposer code into extension. * Annotate nbytes to show it's used for asserts only, fixing one more warning. * Fix makefiles: - Include neon extensions into contrib Makefile - Configure libpqwalproposer more like other extensions * Add comment about lack of PG timelines, and make StartReplication static again. * Fix some compiler warnings in vendor/postgres, and pull libpqwalproposer into vendor/neon * Fix issue with makefile that didn't get caught in the normal test envs. 
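[Editor's illustration] The bullets above describe the mechanics this patch relies on: the walproposer becomes part of the neon extension, so its GUCs, shared-memory state, and background worker must all be registered from the extension's _PG_init() instead of from core. The following is a condensed, self-contained sketch of that registration pattern using standard PostgreSQL extension APIs only; every name in it (minimal_worker_ext, minimal.naptime, minimal_worker_main) is hypothetical and not taken from the patch — the real wiring is in the contrib/neon/walproposer.c and contrib/neon/neon.c hunks below.

/* minimal_worker_ext.c -- illustrative sketch only; all names are hypothetical */
#include "postgres.h"

#include <limits.h>

#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "utils/guc.h"

PG_MODULE_MAGIC;

void		_PG_init(void);
PGDLLEXPORT void minimal_worker_main(Datum main_arg);

static int	minimal_naptime_ms;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

/* Attach to (or create) this extension's shared state, chaining the previous hook. */
static void
minimal_shmem_startup(void)
{
	bool		found;

	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();

	(void) ShmemInitStruct("minimal_worker_state", sizeof(int), &found);
}

/* Worker entry point; resolved by name from this shared library, not from "postgres". */
void
minimal_worker_main(Datum main_arg)
{
	BackgroundWorkerUnblockSignals();

	for (;;)
	{
		/* real work would go here */
		pg_usleep(minimal_naptime_ms * 1000L);
		CHECK_FOR_INTERRUPTS();
	}
}

void
_PG_init(void)
{
	BackgroundWorker bgw;

	/* Only meaningful when loaded via shared_preload_libraries. */
	if (!process_shared_preload_libraries_in_progress)
		return;

	/* Extension-owned GUC, analogous to the neon.* settings the patch registers. */
	DefineCustomIntVariable("minimal.naptime",
							"Sleep time between iterations of the example worker.",
							NULL,
							&minimal_naptime_ms,
							1000, 1, INT_MAX,
							PGC_SIGHUP,
							GUC_UNIT_MS,
							NULL, NULL, NULL);

	/* Reserve shared memory and chain into shmem_startup_hook. */
	RequestAddinShmemSpace(sizeof(int));
	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = minimal_shmem_startup;

	/* Register the background worker so its entry point is loaded from this .so. */
	memset(&bgw, 0, sizeof(bgw));
	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "minimal_worker_ext");
	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "minimal_worker_main");
	snprintf(bgw.bgw_name, BGW_MAXLEN, "minimal worker");
	snprintf(bgw.bgw_type, BGW_MAXLEN, "minimal worker");
	bgw.bgw_restart_time = 5;
	RegisterBackgroundWorker(&bgw);
}

The key detail mirrored from the diff below is bgw_library_name: once the worker code lives in the extension, the background-worker registration must name the extension's shared library (the patch changes it from "postgres" to "neon") so the postmaster can resolve the entry point after the code is no longer linked into the server binary.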
--- contrib/Makefile | 2 + contrib/neon/Makefile | 10 +- contrib/neon/libpagestore.c | 9 +- .../neon}/libpqwalproposer.c | 9 +- contrib/neon/neon.c | 15 +- contrib/neon/neon.h | 19 + contrib/neon/pagestore_client.h | 2 +- contrib/neon/pagestore_smgr.c | 2 +- .../neon}/walproposer.c | 223 ++-- .../neon}/walproposer.h | 34 +- contrib/neon/walproposer_utils.c | 1110 +++++++++++++++++ contrib/neon/walproposer_utils.h | 19 + src/Makefile | 1 - src/backend/access/transam/xloginsert.c | 2 +- src/backend/main/main.c | 2 +- src/backend/postmaster/bgworker.c | 5 +- src/backend/postmaster/postmaster.c | 7 +- src/backend/replication/Makefile | 3 +- .../replication/libpqwalproposer/Makefile | 37 - src/backend/replication/walpropcompat.c | 96 ++ src/backend/replication/walproposer_utils.c | 402 ------ src/backend/replication/walsender.c | 255 ++-- src/backend/storage/ipc/ipci.c | 5 - src/backend/tcop/postgres.c | 19 +- src/backend/utils/misc/guc.c | 35 +- src/include/replication/walpropshim.h | 19 + src/include/replication/walsender.h | 20 +- 27 files changed, 1559 insertions(+), 803 deletions(-) rename {src/backend/replication/libpqwalproposer => contrib/neon}/libpqwalproposer.c (98%) create mode 100644 contrib/neon/neon.h rename {src/backend/replication => contrib/neon}/walproposer.c (95%) rename {src/include/replication => contrib/neon}/walproposer.h (92%) create mode 100644 contrib/neon/walproposer_utils.c create mode 100644 contrib/neon/walproposer_utils.h delete mode 100644 src/backend/replication/libpqwalproposer/Makefile create mode 100644 src/backend/replication/walpropcompat.c delete mode 100644 src/backend/replication/walproposer_utils.c create mode 100644 src/include/replication/walpropshim.h diff --git a/contrib/Makefile b/contrib/Makefile index f27e458482e..9caec6cb81f 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -26,6 +26,8 @@ SUBDIRS = \ isn \ lo \ ltree \ + neon \ + neon_test_utils \ oid2name \ old_snapshot \ pageinspect \ diff --git a/contrib/neon/Makefile b/contrib/neon/Makefile index b6f3cf400ff..d1f48a988c6 100644 --- a/contrib/neon/Makefile +++ b/contrib/neon/Makefile @@ -4,7 +4,14 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o neon.o + inmem_smgr.o \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) @@ -13,6 +20,7 @@ EXTENSION = neon DATA = neon--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c index 2621421532a..15a4a769934 100644 --- a/contrib/neon/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -26,11 +26,10 @@ #include "pgstat.h" #include "utils/guc.h" -#include "replication/walproposer.h" +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" -PG_MODULE_MAGIC; - -void _PG_init(void); #define PageStoreTrace DEBUG5 @@ -355,7 +354,7 @@ substitute_pageserver_password(const char *page_server_connstring_raw) * Module initialization function */ void -_PG_init(void) +pg_init_libpagestore(void) { DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/contrib/neon/libpqwalproposer.c similarity index 98% rename from src/backend/replication/libpqwalproposer/libpqwalproposer.c 
rename to contrib/neon/libpqwalproposer.c index a12a2ee04bc..2b2b7a1a6a4 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/contrib/neon/libpqwalproposer.c @@ -1,11 +1,8 @@ #include "postgres.h" -#include "replication/walproposer.h" #include "libpq-fe.h" - -/* Required for anything that's dynamically loaded */ -PG_MODULE_MAGIC; -void _PG_init(void); +#include "neon.h" +#include "walproposer.h" /* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ struct WalProposerConn @@ -46,7 +43,7 @@ static WalProposerFunctionsType PQWalProposerFunctions = { /* Module initialization */ void -_PG_init(void) +pg_init_libpqwalproposer(void) { if (WalProposerFunctions != NULL) elog(ERROR, "libpqwalproposer already loaded"); diff --git a/contrib/neon/neon.c b/contrib/neon/neon.c index c7c176dba7a..94ff9851eae 100644 --- a/contrib/neon/neon.c +++ b/contrib/neon/neon.c @@ -17,11 +17,24 @@ #include "storage/bufmgr.h" #include "catalog/pg_type.h" #include "replication/walsender.h" -#include "replication/walproposer.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/pg_lsn.h" +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + + +void _PG_init(void) +{ + pg_init_libpagestore(); + pg_init_libpqwalproposer(); + pg_init_walproposer(); +} + PG_FUNCTION_INFO_V1(pg_cluster_size); PG_FUNCTION_INFO_V1(backpressure_lsns); diff --git a/contrib/neon/neon.h b/contrib/neon/neon.h new file mode 100644 index 00000000000..2c66bc7bf05 --- /dev/null +++ b/contrib/neon/neon.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. + * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_libpqwalproposer(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/contrib/neon/pagestore_client.h b/contrib/neon/pagestore_client.h index 93ea6771eb9..f79a3c9142f 100644 --- a/contrib/neon/pagestore_client.h +++ b/contrib/neon/pagestore_client.h @@ -182,7 +182,7 @@ extern void zenith_write(SMgrRelation reln, ForkNumber forknum, extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern int64 zenith_dbsize(Oid dbNode); +extern const int64 zenith_dbsize(Oid dbNode); extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 5fdfea5e487..3e1b74dba7c 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -1336,7 +1336,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) /* * zenith_db_size() -- Get the size of the database in bytes. 
*/ -int64 +const int64 zenith_dbsize(Oid dbNode) { ZenithResponse *resp; diff --git a/src/backend/replication/walproposer.c b/contrib/neon/walproposer.c similarity index 95% rename from src/backend/replication/walproposer.c rename to contrib/neon/walproposer.c index ef3b1917ff0..245b45727e5 100644 --- a/src/backend/replication/walproposer.c +++ b/contrib/neon/walproposer.c @@ -38,7 +38,6 @@ #include #include "access/xlogdefs.h" #include "access/xlogutils.h" -#include "replication/walproposer.h" #include "storage/latch.h" #include "miscadmin.h" #include "pgstat.h" @@ -51,11 +50,21 @@ #include "postmaster/postmaster.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/timestamp.h" +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" +#include "replication/walpropshim.h" + char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; @@ -102,14 +111,11 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ -static bool syncSafekeepers; - static WalproposerShmemState *walprop_shared; /* Prototypes for private functions */ -static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); -static void WalProposerStart(void); +static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStartImpl(void); static void WalProposerLoop(void); static void InitEventSet(void); static void UpdateEventSet(Safekeeper *sk, uint32 events); @@ -150,6 +156,88 @@ static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperSta static bool AsyncFlush(Safekeeper *sk); +static void nwp_shmem_startup_hook(void); +static void nwp_register_gucs(void); +static void nwp_prepare_shmem(void); +static uint64 backpressure_lag_impl(void); + + +static shmem_startup_hook_type prev_shmem_startup_hook_type; + + + +void pg_init_walproposer(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + nwp_register_gucs(); + + nwp_prepare_shmem(); + + delay_backend_us = &backpressure_lag_impl; + + WalProposerRegister(); + + WalProposerInit = &WalProposerInitImpl; + WalProposerStart = &WalProposerStartImpl; +} + +static void nwp_register_gucs(void) +{ + DefineCustomStringVariable( + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ + NULL, NULL, NULL + ); + + DefineCustomIntVariable( + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL + ); + + DefineCustomIntVariable( + "neon.safekeeper_connect_timeout", + "Timeout after which give up connection attempt to safekeeper.", + NULL, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL + ); + +} + +/* shmem handling */ + +static void nwp_prepare_shmem(void) +{ + RequestAddinShmemSpace(WalproposerShmemSize()); + + prev_shmem_startup_hook_type = shmem_startup_hook; + shmem_startup_hook = 
nwp_shmem_startup_hook; +} + +static void nwp_shmem_startup_hook(void) +{ + if (prev_shmem_startup_hook_type) + prev_shmem_startup_hook_type(); + + WalproposerShmemInit(); +} + /* * WAL proposer bgworker entry point. */ @@ -190,75 +278,6 @@ WalProposerMain(Datum main_arg) WalProposerStart(); } -/* - * Entry point for `postgres --sync-safekeepers`. - */ -void -WalProposerSync(int argc, char *argv[]) -{ - struct stat stat_buf; - - syncSafekeepers = true; - ThisTimeLineID = 1; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. - * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - ChangeToDataDir(); - - /* Create pg_wal directory, if it doesn't exist */ - if (stat(XLOGDIR, &stat_buf) != 0) - { - ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); - if (MakePGDirectory(XLOGDIR) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - XLOGDIR))); - exit(1); - } - } - - WalProposerInit(0, 0); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} - /* * Create new AppendRequest message and start sending it. This function is * called from walsender every time the new WAL is available. 
@@ -362,7 +381,7 @@ WalProposerRegister(void) memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); @@ -374,21 +393,19 @@ WalProposerRegister(void) } static void -WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) +WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) { char *host; char *sep; char *port; /* Load the libpq-specific functions */ - load_file("libpqwalproposer", false); if (WalProposerFunctions == NULL) elog(ERROR, "libpqwalproposer didn't initialize correctly"); load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("neon", false); for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { @@ -454,7 +471,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) } static void -WalProposerStart(void) +WalProposerStartImpl(void) { /* Initiate connections to all safekeeper nodes */ @@ -1531,7 +1548,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = greetRequest.timeline; cmd.startpoint = startpos; - StartReplication(&cmd); + StartProposerReplication(&cmd); } /* @@ -2349,3 +2366,47 @@ AsyncFlush(Safekeeper *sk) return false; } } + +// Check if we need to suspend inserts because of lagging replication. +static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != InvalidXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != InvalidXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/src/include/replication/walproposer.h b/contrib/neon/walproposer.h similarity index 92% rename from src/include/replication/walproposer.h rename to contrib/neon/walproposer.h index c5a5b76268e..3dd8bc19191 100644 --- a/src/include/replication/walproposer.h +++ b/contrib/neon/walproposer.h @@ -1,5 +1,5 @@ -#ifndef __WALPROPOSER_H__ -#define __WALPROPOSER_H__ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ #include "access/xlogdefs.h" #include "postgres.h" @@ -361,37 +361,13 @@ typedef struct Safekeeper } Safekeeper; -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void 
AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -void WalProposerMain(Datum main_arg); +extern PGDLLIMPORT void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); void WalProposerPoll(void); void WalProposerRegister(void); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); -void ProcessStandbyReply(XLogRecPtr writePtr, - XLogRecPtr flushPtr, - XLogRecPtr applyPtr, - TimestampTz replyTime, - bool replyRequested); -void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); -void ProcessStandbyHSFeedback(TimestampTz replyTime, - TransactionId feedbackXmin, - uint32 feedbackEpoch, - TransactionId feedbackCatalogXmin, - uint32 feedbackCatalogEpoch); void ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf); -void StartReplication(StartReplicationCmd *cmd); -void WalProposerSync(int argc, char *argv[]); +extern void StartProposerReplication(StartReplicationCmd *cmd); Size WalproposerShmemSize(void); bool WalproposerShmemInit(void); @@ -562,4 +538,4 @@ typedef struct WalProposerFunctionsType */ extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; -#endif +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/contrib/neon/walproposer_utils.c b/contrib/neon/walproposer_utils.c new file mode 100644 index 00000000000..cd8fd556c22 --- /dev/null +++ b/contrib/neon/walproposer_utils.c @@ -0,0 +1,1110 @@ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlogutils.h" +#include "common/logging.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "walproposer_utils.h" +#include "replication/walsender_private.h" + +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" + +#include "../../src/interfaces/libpq/libpq-fe.h" +#include +#include + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + +/* START cloned file-local variables and functions from walsender.c */ + +/* + * xlogreader used for replication. Note that a WAL sender doing physical + * replication does not need xlogreader to read WAL, but it needs one to + * keep a state of its work. + */ +static XLogReaderState *xlogreader = NULL; + +/* + * These variables keep track of the state of the timeline we're currently + * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, + * the timeline is not the latest timeline on this server, and the server's + * history forked off from that timeline at sendTimeLineValidUpto. 
+ */ +static TimeLineID sendTimeLine = 0; +static TimeLineID sendTimeLineNextTLI = 0; +static bool sendTimeLineIsHistoric = false; +static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; + +/* + * Timestamp of last ProcessRepliesIfAny() that saw a reply from the + * standby. Set to 0 if wal_sender_timeout doesn't need to be active. + */ +static TimestampTz last_reply_timestamp = 0; + +/* Have we sent a heartbeat message asking for reply, since last reply? */ +static bool waiting_for_ping_response = false; + +static bool streamingDoneSending; +static bool streamingDoneReceiving; + +/* Are we there yet? */ +static bool WalSndCaughtUp = false; + +/* Flags set by signal handlers for later service in main loop */ +static volatile sig_atomic_t got_STOPPING = false; + +/* + * How far have we sent WAL already? This is also advertised in + * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) + */ +static XLogRecPtr sentPtr = InvalidXLogRecPtr; + +/* + * This is set while we are streaming. When not set + * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, + * the main loop is responsible for checking got_STOPPING and terminating when + * it's set (after streaming any remaining WAL). + */ +static volatile sig_atomic_t replication_active = false; + +typedef void (*WalSndSendDataCallback) (void); +static void WalSndLoop(WalSndSendDataCallback send_data); +static void XLogSendPhysical(void); +static XLogRecPtr GetStandbyFlushRecPtr(void); + +static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p); + +/* END cloned file-level variables and functions from walsender.c */ + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* Returns a human-readable string corresonding to the SafekeeperState + * + * The string should not be freed. + * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +char* +FormatSafekeeperState(SafekeeperState state) +{ + char* return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +void +AssertEventsOkForState(uint32 events, Safekeeper* sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. 
+ * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. */ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) +{ + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. */ +char* +FormatEvents(uint32 events) +{ + static char return_str[8]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an + * sense of what events have been triggered without needing to remember your powers of two. */ + + return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[5] = (events & WL_SOCKET_CONNECTED) ? 
'C' : '_'; + + if (events & (~all_flags)) + { + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[6] = '*'; + return_str[7] = '\0'; + } + else + return_str[6] = '\0'; + + return (char *) &return_str; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. + */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} + +/* + * Write XLOG data to disk. 
+ */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + bool use_existent = true; + + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. + */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} + +/* START of cloned functions from walsender.c */ + +/* + * Handle START_REPLICATION command. + * + * At the moment, this never returns, but an ereport(ERROR) will take us back + * to the main loop. + */ +void +StartProposerReplication(StartReplicationCmd *cmd) +{ + XLogRecPtr FlushPtr; + + if (ThisTimeLineID == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + + /* create xlogreader for physical replication */ + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + NULL); + + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* + * We assume here that we're logging enough information in the WAL for + * log-shipping, since this is checked in PostmasterMain(). + * + * NOTE: wal_level can only change at shutdown, so in most cases it is + * difficult for there to be WAL data that we can still see that was + * written at wal_level='minimal'. 
+ */ + + if (cmd->slotname) + { + ReplicationSlotAcquire(cmd->slotname, true); + if (SlotIsLogical(MyReplicationSlot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use a logical replication slot for physical replication"))); + + /* + * We don't need to verify the slot's restart_lsn here; instead we + * rely on the caller requesting the starting point to use. If the + * WAL segment doesn't exist, we'll fail later. + */ + } + + /* + * Select the timeline. If it was given explicitly by the client, use + * that. Otherwise use the timeline of the last replayed record, which is + * kept in ThisTimeLineID. + * + * Neon doesn't currently use PG Timelines, but it may in the future, so + * we keep this code around to lighten the load for when we need it. + */ + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(); + } + else + FlushPtr = GetFlushRecPtr(); + + if (cmd->timeline != 0) + { + XLogRecPtr switchpoint; + + sendTimeLine = cmd->timeline; + if (sendTimeLine == ThisTimeLineID) + { + sendTimeLineIsHistoric = false; + sendTimeLineValidUpto = InvalidXLogRecPtr; + } + else + { + List *timeLineHistory; + + sendTimeLineIsHistoric = true; + + /* + * Check that the timeline the client requested exists, and the + * requested start location is on that timeline. + */ + timeLineHistory = readTimeLineHistory(ThisTimeLineID); + switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, + &sendTimeLineNextTLI); + list_free_deep(timeLineHistory); + + /* + * Found the requested timeline in the history. Check that + * requested startpoint is on that timeline in our history. + * + * This is quite loose on purpose. We only check that we didn't + * fork off the requested timeline before the switchpoint. We + * don't check that we switched *to* it before the requested + * starting point. This is because the client can legitimately + * request to start replication from the beginning of the WAL + * segment that contains switchpoint, but on the new timeline, so + * that it doesn't end up with a partial segment. If you ask for + * too old a starting point, you'll get an error later when we + * fail to find the requested WAL segment in pg_wal. + * + * XXX: we could be more strict here and only allow a startpoint + * that's older than the switchpoint, if it's still in the same + * WAL segment. + */ + if (!XLogRecPtrIsInvalid(switchpoint) && + switchpoint < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); + } + sendTimeLineValidUpto = switchpoint; + } + } + else + { + sendTimeLine = ThisTimeLineID; + sendTimeLineValidUpto = InvalidXLogRecPtr; + sendTimeLineIsHistoric = false; + } + + streamingDoneSending = streamingDoneReceiving = false; + + /* If there is nothing to stream, don't even enter COPY mode */ + if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) + { + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. 
We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); + } + + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; + + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); + + SyncRepInitConfig(); + + /* Main loop of walsender */ + replication_active = true; + + WalSndLoop(XLogSendPhysical); + + replication_active = false; + if (got_STOPPING) + proc_exit(0); + WalSndSetState(WALSNDSTATE_STARTUP); + + Assert(streamingDoneSending && streamingDoneReceiving); + } + + if (cmd->slotname) + ReplicationSlotRelease(); + + /* + * Copy is finished now. Send a single-row result set indicating the next + * timeline. + */ + if (sendTimeLineIsHistoric) + { + char startpos_str[8 + 1 + 8 + 1]; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[2]; + bool nulls[2]; + + snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + LSN_FORMAT_ARGS(sendTimeLineValidUpto)); + + dest = CreateDestReceiver(DestRemoteSimple); + MemSet(nulls, false, sizeof(nulls)); + + /* + * Need a tuple descriptor representing two columns. int8 may seem + * like a surprising data type for this, but in theory int4 would not + * be wide enough for this, as TimeLineID is unsigned. + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", + INT8OID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", + TEXTOID, -1, 0); + + /* prepare for projection of tuple */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); + values[1] = CStringGetTextDatum(startpos_str); + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + + end_tup_output(tstate); + } + + /* Send CommandComplete message */ + EndReplicationCommand("START_STREAMING"); +} + +/* + * Returns the latest point in WAL that has been safely flushed to disk, and + * can be sent to the standby. This should only be called when in recovery, + * ie. we're streaming to a cascaded standby. + * + * As a side-effect, ThisTimeLineID is updated to the TLI of the last + * replayed WAL record. + */ +static XLogRecPtr +GetStandbyFlushRecPtr(void) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. 
+ */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + ThisTimeLineID = replayTLI; + + result = replayPtr; + if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) + result = receivePtr; + + return result; +} + +/* XLogReaderRoutine->segment_open callback */ +static void +WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + char path[MAXPGPATH]; + + /*------- + * When reading from a historic timeline, and there is a timeline switch + * within this segment, read from the WAL segment belonging to the new + * timeline. + * + * For example, imagine that this server is currently on timeline 5, and + * we're streaming timeline 4. The switch from timeline 4 to 5 happened at + * 0/13002088. In pg_wal, we have these files: + * + * ... + * 000000040000000000000012 + * 000000040000000000000013 + * 000000050000000000000013 + * 000000050000000000000014 + * ... + * + * In this situation, when requested to send the WAL from segment 0x13, on + * timeline 4, we read the WAL from file 000000050000000000000013. Archive + * recovery prefers files from newer timelines, so if the segment was + * restored from the archive on this server, the file belonging to the old + * timeline, 000000040000000000000013, might not exist. Their contents are + * equal up to the switchpoint, because at a timeline switch, the used + * portion of the old segment is copied to the new file. ------- + */ + *tli_p = sendTimeLine; + if (sendTimeLineIsHistoric) + { + XLogSegNo endSegNo; + + XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); + if (nextSegNo == endSegNo) + *tli_p = sendTimeLineNextTLI; + } + + XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + /* + * If the file is not found, assume it's because the standby asked for a + * too old WAL segment that has already been removed or recycled. + */ + if (errno == ENOENT) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + xlogfname))); + } + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + + +/* Main loop of walsender process that streams the WAL over Copy messages. */ +static void +WalSndLoop(WalSndSendDataCallback send_data) +{ + /* + * Initialize the last reply timestamp. That enables timeout processing + * from hereon. + */ + last_reply_timestamp = GetCurrentTimestamp(); + waiting_for_ping_response = false; + + /* + * Loop until we reach the end of this timeline or the client requests to + * stop streaming. + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* always true */ + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + } +} + +/* + * Send out the WAL in its normal physical/stored form. 
+ * + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, + * but not yet sent to the client, and buffer it in the libpq output + * buffer. + * + * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, + * otherwise WalSndCaughtUp is set to false. + */ +static void +XLogSendPhysical(void) +{ + XLogRecPtr SendRqstPtr; + XLogRecPtr startptr; + XLogRecPtr endptr; + Size nbytes PG_USED_FOR_ASSERTS_ONLY; + + /* If requested switch the WAL sender to the stopping state. */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + if (streamingDoneSending) + { + WalSndCaughtUp = true; + return; + } + + /* Figure out how far we can safely send the WAL. */ + if (sendTimeLineIsHistoric) + { + /* + * Streaming an old timeline that's in this server's history, but is + * not the one we're currently inserting or replaying. It can be + * streamed up to the point where we switched off that timeline. + */ + SendRqstPtr = sendTimeLineValidUpto; + } + else if (am_cascading_walsender) + { + /* + * Streaming the latest timeline on a standby. + * + * Attempt to send all WAL that has already been replayed, so that we + * know it's valid. If we're receiving WAL through streaming + * replication, it's also OK to send any WAL that has been received + * but not replayed. + * + * The timeline we're recovering from can change, or we can be + * promoted. In either case, the current timeline becomes historic. We + * need to detect that so that we don't try to stream past the point + * where we switched to another timeline. We check for promotion or + * timeline switch after calculating FlushPtr, to avoid a race + * condition: if the timeline becomes historic just after we checked + * that it was still current, it's still be OK to stream it up to the + * FlushPtr that was calculated before it became historic. + */ + bool becameHistoric = false; + + SendRqstPtr = GetStandbyFlushRecPtr(); + + if (!RecoveryInProgress()) + { + /* + * We have been promoted. RecoveryInProgress() updated + * ThisTimeLineID to the new current timeline. + */ + am_cascading_walsender = false; + becameHistoric = true; + } + else + { + /* + * Still a cascading standby. But is the timeline we're sending + * still the one recovery is recovering from? ThisTimeLineID was + * updated by the GetStandbyFlushRecPtr() call above. + */ + if (sendTimeLine != ThisTimeLineID) + becameHistoric = true; + } + + if (becameHistoric) + { + /* + * The timeline we were sending has become historic. Read the + * timeline history file of the new timeline to see where exactly + * we forked off from the timeline we were sending. + */ + List *history; + + history = readTimeLineHistory(ThisTimeLineID); + sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); + + Assert(sendTimeLine < sendTimeLineNextTLI); + list_free_deep(history); + + sendTimeLineIsHistoric = true; + + SendRqstPtr = sendTimeLineValidUpto; + } + } + else + { + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. 
+ */ + SendRqstPtr = GetFlushRecPtr(); + } + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close to together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. + */ + LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); + + /* + * If this is a historic timeline and we've reached the point where we + * forked to the next timeline, stop streaming. + * + * Note: We might already have sent WAL > sendTimeLineValidUpto. The + * startup process will normally replay all WAL that has been received + * from the primary, before promoting, but if the WAL streaming is + * terminated at a WAL page boundary, the valid portion of the timeline + * might end in the middle of a WAL record. We might've already sent the + * first half of that partial WAL record to the cascading standby, so that + * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't + * replay the partial WAL record either, so it can still follow our + * timeline switch. + */ + if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) + { + /* close the current file. */ + if (xlogreader->seg.ws_file >= 0) + wal_segment_close(xlogreader); + + /* Send CopyDone */ + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + + WalSndCaughtUp = true; + + elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + LSN_FORMAT_ARGS(sendTimeLineValidUpto), + LSN_FORMAT_ARGS(sentPtr)); + return; + } + + /* Do we have any work to do? */ + Assert(sentPtr <= SendRqstPtr); + if (SendRqstPtr <= sentPtr) + { + WalSndCaughtUp = true; + return; + } + + /* + * Figure out how much to send in one message. If there's no more than + * MAX_SEND_SIZE bytes to send, send everything. Otherwise send + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. + * + * The rounding is not only for performance reasons. Walreceiver relies on + * the fact that we never split a WAL record across two messages. Since a + * long WAL record is split at page boundary into continuation records, + * page boundary is always a safe cut-off point. We also assume that + * SendRqstPtr never points to the middle of a WAL record. 
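+	 *
+	 * Concretely, endptr starts at startptr + MAX_SEND_SIZE; if that
+	 * overshoots SendRqstPtr it is clamped to SendRqstPtr, otherwise it is
+	 * rounded down to a page boundary (endptr -= endptr % XLOG_BLCKSZ).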
+ */ + startptr = sentPtr; + endptr = startptr; + endptr += MAX_SEND_SIZE; + + /* if we went beyond SendRqstPtr, back off */ + if (SendRqstPtr <= endptr) + { + endptr = SendRqstPtr; + if (sendTimeLineIsHistoric) + WalSndCaughtUp = false; + else + WalSndCaughtUp = true; + } + else + { + /* round down to page boundary. */ + endptr -= (endptr % XLOG_BLCKSZ); + WalSndCaughtUp = false; + } + + nbytes = endptr - startptr; + Assert(nbytes <= MAX_SEND_SIZE); + + /* always true */ + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } + else + { + /* code removed for brevity */ + } + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(sentPtr)); + set_ps_display(activitymsg); + } +} + diff --git a/contrib/neon/walproposer_utils.h b/contrib/neon/walproposer_utils.h new file mode 100644 index 00000000000..4771d3ff829 --- /dev/null +++ b/contrib/neon/walproposer_utils.h @@ -0,0 +1,19 @@ +#ifndef __NEON_WALPROPOSER_UTILS_H__ +#define __NEON_WALPROPOSER_UTILS_H__ + +#include "walproposer.h" + +int CompareLsn(const void *a, const void *b); +char* FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper* sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char* FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); + +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/src/Makefile b/src/Makefile index 2f32e3d5137..79e274a4769 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,7 +22,6 @@ SUBDIRS = \ include \ interfaces \ backend/replication/libpqwalreceiver \ - backend/replication/libpqwalproposer \ backend/replication/pgoutput \ fe_utils \ bin \ diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 493f3cb45d4..750cd040a31 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -461,7 +461,7 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } - if (backpressure_lag() > 0) + if (delay_backend_us != NULL && delay_backend_us() > 0) { InterruptPending = true; } diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 8ca0b7f57fa..b63d7bfb1c7 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -35,7 +35,7 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 2be49df0eb0..540a8454da2 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,7 +22,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" -#include "replication/walproposer.h" +#include 
"replication/walpropshim.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -129,9 +129,6 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain - }, - { - "WalProposerMain", WalProposerMain } }; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 04694ae0583..855b1de5de7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -117,7 +117,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -998,11 +998,6 @@ PostmasterMain(int argc, char *argv[]) */ ApplyLauncherRegister(); - /* - * Start WAL proposer bgworker is wal acceptors list is not empty - */ - WalProposerRegister(); - /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 23731a07576..4b7c3d32a4d 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -25,8 +25,7 @@ OBJS = \ walreceiver.o \ walreceiverfuncs.o \ walsender.o \ - walproposer.o \ - walproposer_utils.o + walpropcompat.o SUBDIRS = logical diff --git a/src/backend/replication/libpqwalproposer/Makefile b/src/backend/replication/libpqwalproposer/Makefile deleted file mode 100644 index c570160536f..00000000000 --- a/src/backend/replication/libpqwalproposer/Makefile +++ /dev/null @@ -1,37 +0,0 @@ -#------------------------------------------------------------------------- -# -# Makefile-- -# Makefile for src/backend/replication/libpqwalproposer -# -# IDENTIFICATION -# src/backend/replication/libpqwalproposer/Makefile -# -#------------------------------------------------------------------------- - -subdir = src/backend/replication/libpqwalproposer -top_builddir = ../../../.. -include $(top_builddir)/src/Makefile.global - -override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) - -OBJS = \ - $(WIN32RES) \ - libpqwalproposer.o -SHLIB_LINK_INTERNAL = $(libpq) -SHLIB_LINK = $(filter -lintl, $(LIBS)) -SHLIB_PREREQS = submake-libpq -PGFILEDESC = "libpqwalproposer - libpq interface for WAL proposer" -NAME = libpqwalproposer - -all: all-shared-lib - -include $(top_srcdir)/src/Makefile.shlib - -install: all installdirs install-lib - -installdirs: installdirs-lib - -uninstall: uninstall-lib - -clean distclean maintainer-clean: clean-lib - rm -f $(OBJS) diff --git a/src/backend/replication/walpropcompat.c b/src/backend/replication/walpropcompat.c new file mode 100644 index 00000000000..8caf2460795 --- /dev/null +++ b/src/backend/replication/walpropcompat.c @@ -0,0 +1,96 @@ +#include "postgres.h" + +#include +#include +#include + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "postmaster/postmaster.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "replication/walpropshim.h" + +bool syncSafekeepers = false; +void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId) = NULL; +void (*WalProposerStart) (void) = NULL; + +/* + * Entry point for `postgres --sync-safekeepers`. 
+ */ +void +WalProposerSync(int argc, char *argv[]) +{ + struct stat stat_buf; + + syncSafekeepers = true; + ThisTimeLineID = 1; + + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + + load_file("neon", false); + + if (NULL == WalProposerInit) + elog(ERROR, "Neon failed to register WalProposerInit"); + + if (NULL == WalProposerStart) + elog(ERROR, "Neon failed to register WalProposerStart"); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); +} diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c deleted file mode 100644 index c9ddafdee0c..00000000000 --- a/src/backend/replication/walproposer_utils.c +++ /dev/null @@ -1,402 +0,0 @@ -#include "postgres.h" - -#include "replication/walproposer.h" -#include "libpq/pqformat.h" -#include "common/logging.h" -#include "common/ip.h" -#include "../interfaces/libpq/libpq-fe.h" -#include -#include - -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -int -CompareLsn(const void *a, const void *b) -{ - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - - if (lsn1 < lsn2) - return -1; - else if (lsn1 == lsn2) - return 0; - else - return 1; -} - -/* Returns a human-readable string corresonding to the SafekeeperState - * - * The string should not be freed. 
- * - * The strings are intended to be used as a prefix to "state", e.g.: - * - * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); - * - * If this sort of phrasing doesn't fit the message, instead use something like: - * - * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); - */ -char* -FormatSafekeeperState(SafekeeperState state) -{ - char* return_val = NULL; - - switch (state) - { - case SS_OFFLINE: - return_val = "offline"; - break; - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - return_val = "connecting"; - break; - case SS_WAIT_EXEC_RESULT: - return_val = "receiving query result"; - break; - case SS_HANDSHAKE_RECV: - return_val = "handshake (receiving)"; - break; - case SS_VOTING: - return_val = "voting"; - break; - case SS_WAIT_VERDICT: - return_val = "wait-for-verdict"; - break; - case SS_SEND_ELECTED_FLUSH: - return_val = "send-announcement-flush"; - break; - case SS_IDLE: - return_val = "idle"; - break; - case SS_ACTIVE: - return_val = "active"; - break; - } - - Assert(return_val != NULL); - - return return_val; -} - -/* Asserts that the provided events are expected for given safekeeper's state */ -void -AssertEventsOkForState(uint32 events, Safekeeper* sk) -{ - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) - */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ - - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); - - if (!events_ok_for_state) - { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. */ - elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); - Assert(events_ok_for_state); - } -} - -/* Returns the set of events a safekeeper in this state should be waiting on - * - * This will return WL_NO_EVENTS (= 0) for some events. */ -uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) -{ - uint32 result = WL_NO_EVENTS; - - /* If the state doesn't have a modifier, we can check the base state */ - switch (state) - { - /* Connecting states say what they want in the name */ - case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; - case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; - - /* Reading states need the socket to be read-ready to continue */ - case SS_WAIT_EXEC_RESULT: - case SS_HANDSHAKE_RECV: - case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; - - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ - case SS_VOTING: - case SS_IDLE: - result = WL_SOCKET_READABLE; - break; - - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ - case SS_SEND_ELECTED_FLUSH: - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - - /* The offline state expects no events. 
*/ - case SS_OFFLINE: - result = WL_NO_EVENTS; - break; - - default: - Assert(false); - break; - } - - return result; -} - -/* Returns a human-readable string corresponding to the event set - * - * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the - * returned string may be meaingless. - * - * The string should not be freed. It should also not be expected to remain the same between - * function calls. */ -char* -FormatEvents(uint32 events) -{ - static char return_str[8]; - - /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; - - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ - - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; - return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; - return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; - return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; - return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; - - if (events & (~all_flags)) - { - elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); - return_str[6] = '*'; - return_str[7] = '\0'; - } - else - return_str[6] = '\0'; - - return (char *) &return_str; -} - -/* - * Convert a character which represents a hexadecimal digit to an integer. - * - * Returns -1 if the character is not a hexadecimal digit. - */ -static int -HexDecodeChar(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - - return -1; -} - -/* - * Decode a hex string into a byte string, 2 hex chars per byte. - * - * Returns false if invalid characters are encountered; otherwise true. 
- */ -bool -HexDecodeString(uint8 *result, char *input, int nbytes) -{ - int i; - - for (i = 0; i < nbytes; ++i) - { - int n1 = HexDecodeChar(input[i * 2]); - int n2 = HexDecodeChar(input[i * 2 + 1]); - - if (n1 < 0 || n2 < 0) - return false; - result[i] = n1 * 16 + n2; - } - - return true; -} - -/* -------------------------------- - * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint32 -pq_getmsgint32_le(StringInfo msg) -{ - uint32 n32; - - pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); - - return n32; -} - -/* -------------------------------- - * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint64 -pq_getmsgint64_le(StringInfo msg) -{ - uint64 n64; - - pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); - - return n64; -} - -/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ -void -pq_sendint32_le(StringInfo buf, uint32 i) -{ - enlargeStringInfo(buf, sizeof(uint32)); - memcpy(buf->data + buf->len, &i, sizeof(uint32)); - buf->len += sizeof(uint32); -} - -/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ -void -pq_sendint64_le(StringInfo buf, uint64 i) -{ - enlargeStringInfo(buf, sizeof(uint64)); - memcpy(buf->data + buf->len, &i, sizeof(uint64)); - buf->len += sizeof(uint64); -} - -/* - * Write XLOG data to disk. - */ -void -XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { - bool use_existent = true; - - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. - */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. 
- */ -void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index e97184490c5..ce16a78a61c 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -73,7 +73,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -130,6 +130,12 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; + +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -235,11 +241,10 @@ static XLogRecPtr GetStandbyFlushRecPtr(void); static void IdentifySystem(void); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); -void StartReplication(StartReplicationCmd *cmd); +static void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); -static void ProcessReplicationFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); @@ -252,8 +257,6 @@ static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, Tran static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write); static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -570,7 +573,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. 
*/ -void +static void StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; @@ -711,14 +714,11 @@ StartReplication(StartReplicationCmd *cmd) WalSndSetState(WALSNDSTATE_CATCHUP); /* Send a CopyBothResponse message, and start streaming */ - if (!am_wal_proposer) - { - pq_beginmessage(&buf, 'W'); - pq_sendbyte(&buf, 0); - pq_sendint16(&buf, 0); - pq_endmessage(&buf); - pq_flush(); - } + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); /* * Don't allow a request to stream from a future point in WAL that @@ -1340,7 +1340,7 @@ ProcessPendingWrites(void) } /* Try to flush pending output to the client */ - if (!am_wal_proposer && pq_flush_if_writable() != 0) + if (pq_flush_if_writable() != 0) WalSndShutdown(); } @@ -1749,9 +1749,6 @@ ProcessRepliesIfAny(void) int r; bool received = false; - if (am_wal_proposer) - return; - last_processing = GetCurrentTimestamp(); /* @@ -1877,10 +1874,6 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; - case 'z': - ProcessReplicationFeedbackMessage(); - break; - default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -1953,28 +1946,6 @@ ProcessStandbyReplyMessage(void) LSN_FORMAT_ARGS(applyPtr)); } -// This message is a neon extension of postgres replication protocol -static void -ProcessReplicationFeedbackMessage(void) -{ - ReplicationFeedback rf; - - // consume message length - pq_getmsgint64(&reply_message); - - ParseReplicationFeedbackMessage(&reply_message, &rf); - - replication_feedback_set(&rf); - - SetZenithCurrentClusterSize(rf.currentClusterSize); - - ProcessStandbyReply(rf.ps_writelsn, - rf.ps_flushlsn, - rf.ps_applylsn, - rf.ps_replytime, - false); -} - void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, @@ -2058,13 +2029,6 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); - /* - * walproposer use trunclateLsn instead of flushPtr for confirmed - * received location, so we shouldn't update restart_lsn here. - */ - if (am_wal_proposer) - return; - /* * Advance our local xmin horizon when the client confirmed a flush. */ @@ -2394,19 +2358,6 @@ WalSndLoop(WalSndSendDataCallback send_data) /* Check for input from the client */ ProcessRepliesIfAny(); - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } - /* * If we have received CopyDone from the client, sent CopyDone * ourselves, and the output buffer is empty, it's time to exit @@ -2865,83 +2816,77 @@ XLogSendPhysical(void) nbytes = endptr - startptr; Assert(nbytes <= MAX_SEND_SIZE); - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } + /* + * OK to read and send the slice. + */ + if (output_message.data) + resetStringInfo(&output_message); else - { - /* - * OK to read and send the slice. - */ - if (output_message.data) - resetStringInfo(&output_message); - else - initStringInfo(&output_message); + initStringInfo(&output_message); + + pq_sendbyte(&output_message, 'w'); + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. 
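+	 * The buffer is enlarged by nbytes first so that WALRead() can place
+	 * the data directly after the message header bytes already appended
+	 * above.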
+ */ + enlargeStringInfo(&output_message, nbytes); + +retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); - pq_sendbyte(&output_message, 'w'); - pq_sendint64(&output_message, startptr); /* dataStart */ - pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ - pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ - enlargeStringInfo(&output_message, nbytes); - - retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); - - /* See logical_read_xlog_page(). */ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - - /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ - if (am_cascading_walsender) - { - WalSnd *walsnd = MyWalSnd; - bool reload; + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. + */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; - SpinLockAcquire(&walsnd->mutex); - reload = walsnd->needreload; - walsnd->needreload = false; - SpinLockRelease(&walsnd->mutex); + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) - { - wal_segment_close(xlogreader); + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); - goto retry; - } + goto retry; } + } - output_message.len += nbytes; - output_message.data[output_message.len] = '\0'; + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; - /* - * Fill the send timestamp last, so that it is taken as late as possible. - */ - resetStringInfo(&tmpbuf); - pq_sendint64(&tmpbuf, GetCurrentTimestamp()); - memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); + /* + * Fill the send timestamp last, so that it is taken as late as possible. + */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + + pq_putmessage_noblock('d', output_message.data, output_message.len); - pq_putmessage_noblock('d', output_message.data, output_message.len); - } sentPtr = endptr; /* Update shared memory status */ @@ -3657,7 +3602,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. 
*/ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -3722,7 +3667,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. */ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; @@ -3819,47 +3764,3 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) Assert(time != 0); return now - time; } - -// Check if we need to suspend inserts because of lagging replication. -uint64 -backpressure_lag(void) -{ - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); - #define MB ((XLogRecPtr)1024*1024) - - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((writePtr != InvalidXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) - { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); - } - - if ((flushPtr != InvalidXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); - } - - if ((applyPtr != InvalidXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); - } - } - return 0; -} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 233bd081f82..de498c21dba 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -34,7 +34,6 @@ #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" @@ -152,8 +151,6 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); - size = add_size(size, WalproposerShmemSize()); - #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -274,8 +271,6 @@ CreateSharedMemoryAndSemaphores(void) SyncScanShmemInit(); AsyncShmemInit(); - WalproposerShmemInit(); - #ifdef EXEC_BACKEND /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7ea135c7c9e..3b15ffab8e7 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3401,15 +3401,20 @@ ProcessInterrupts(void) { ProcessInterrupts_pg(); - // Suspend writers until replicas catch up - lag = backpressure_lag(); - if (lag <= 0) - break; + if (delay_backend_us != NULL) + { + // Suspend writers until replicas catch up + lag = delay_backend_us(); + if (lag <= 0) + break; - set_ps_display("backpressure throttling"); + set_ps_display("backpressure throttling"); - elog(DEBUG2, "backpressure throttling: lag %lu", lag); - pg_usleep(BACK_PRESSURE_DELAY); + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + pg_usleep(BACK_PRESSURE_DELAY); + } + else + break; } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 500aa672611..218ab8bf673 100644 --- 
a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -80,7 +80,7 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" @@ -2310,28 +2310,6 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"wal_acceptor_reconnect", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Timeout for reconnecting to offline wal acceptor."), - NULL, - GUC_UNIT_MS - }, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"wal_acceptor_connect_timeout", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Timeout after which give up connection attempt to safekeeper."), - NULL, - GUC_UNIT_MS - }, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - NULL, NULL, NULL - }, - { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), @@ -4672,17 +4650,6 @@ static struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, - { - {"safekeepers", PGC_POSTMASTER, UNGROUPED, - gettext_noop("List of Neon WAL acceptors (host:port)"), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE - }, - &wal_acceptors_list, - "", - NULL, NULL, NULL - }, - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL diff --git a/src/include/replication/walpropshim.h b/src/include/replication/walpropshim.h new file mode 100644 index 00000000000..07757580cc9 --- /dev/null +++ b/src/include/replication/walpropshim.h @@ -0,0 +1,19 @@ +/* + * walpropshim.h + * various hooks for the walproposer component of the Neon extension. + */ + +#ifndef __WALPROPOSER_H__ +#define __WALPROPOSER_H__ + +/* + * Set to true only in standalone run of `postgres --sync-safekeepers`. + * See also the top comment in contrib/neon/walproposer.c + */ +extern PGDLLIMPORT bool syncSafekeepers; +extern PGDLLIMPORT void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId); +extern PGDLLIMPORT void (*WalProposerStart) (void); + +void WalProposerSync(int argc, char *argv[]); + +#endif diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index b597823c7c3..f902457c26b 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -48,7 +48,25 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); -extern uint64 backpressure_lag(void); + +/* + * Hook to check for WAL receiving backpressure. 
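+ *
+ * Left NULL unless an extension installs a callback (for example, a
+ * hypothetical extension could set delay_backend_us = &my_lag_fn from its
+ * _PG_init()); callers therefore check for NULL before invoking it.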
+ * Return value in microseconds */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later * From 186bca0e72a7c7c505c396fc5400b8488678f943 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 19 Aug 2022 11:00:15 +0200 Subject: [PATCH 161/214] Use ECR for image (#195) * Use ECR for image * Keep arg consistent across dockerfiles Co-authored-by: Rory de Zoete --- Dockerfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 11681c9cb16..77648e21bfd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,13 @@ -# Allow specifiyng the different compute-tools tag, so we were able to always use -# the locally built image. -ARG COMPUTE_TOOLS_TAG=latest +# Allow specifiyng different compute-tools tag and image repo, so we are +# able to use different images +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=compute-tools +ARG TAG=latest # # Image with pre-built tools # -FROM neondatabase/compute-tools:$COMPUTE_TOOLS_TAG AS compute-deps +FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps # Only to get ready compute_ctl binary as deppendency # From a8296088d100105375b22c52d7398ec7cce11f1b Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Thu, 25 Aug 2022 12:16:37 +0200 Subject: [PATCH 162/214] walproposer: do not send pageserver connstring with START_WAL_PUSH (#198) It is not used anymore since https://github.com/neondatabase/neon/pull/1872 Fixes https://github.com/neondatabase/cloud/issues/2032 --- contrib/neon/libpagestore.c | 6 ------ contrib/neon/walproposer.c | 11 +---------- contrib/neon/walproposer.h | 1 - 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c index 15a4a769934..8678c306dee 100644 --- a/contrib/neon/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -417,12 +417,6 @@ pg_init_libpagestore(void) zenith_timeline_walproposer = zenith_timeline; zenith_tenant_walproposer = zenith_tenant; - /* - * Walproposer instructs safekeeper which pageserver to use for - * replication - */ - zenith_pageserver_connstring_walproposer = page_server_connstring; - if (wal_redo) { neon_log(PageStoreTrace, "set inmem_smgr hook"); diff --git a/contrib/neon/walproposer.c b/contrib/neon/walproposer.c index 245b45727e5..9625325c0a9 100644 --- a/contrib/neon/walproposer.c +++ b/contrib/neon/walproposer.c @@ -73,7 +73,6 @@ bool am_wal_proposer; char *zenith_timeline_walproposer = NULL; char *zenith_tenant_walproposer = NULL; -char *zenith_pageserver_connstring_walproposer = NULL; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -879,21 +878,13 @@ HandleConnectionEvent(Safekeeper *sk) static void SendStartWALPush(Safekeeper *sk) { - char *query = NULL; - if 
(zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(sk->conn, query)) + if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) { - pfree(query); elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", sk->host, sk->port, walprop_error_message(sk->conn)); ShutdownConnection(sk); return; } - pfree(query); sk->state = SS_WAIT_EXEC_RESULT; UpdateEventSet(sk, WL_SOCKET_READABLE); } diff --git a/contrib/neon/walproposer.h b/contrib/neon/walproposer.h index 3dd8bc19191..b684d5264f7 100644 --- a/contrib/neon/walproposer.h +++ b/contrib/neon/walproposer.h @@ -38,7 +38,6 @@ typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; extern char *zenith_tenant_walproposer; -extern char *zenith_pageserver_connstring_walproposer; /* Possible return values from ReadPGAsync */ typedef enum From d8ee64dd95c9533f4003d2bf0327a1d64f2ee080 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 25 Aug 2022 18:45:01 +0200 Subject: [PATCH 163/214] Remove neon extensions from the vendor repo (#197) --- contrib/Makefile | 2 - contrib/neon/Makefile | 34 - contrib/neon/inmem_smgr.c | 286 -- contrib/neon/libpagestore.c | 433 --- contrib/neon/libpqwalproposer.c | 413 --- contrib/neon/neon--1.0.sql | 17 - contrib/neon/neon.c | 79 - contrib/neon/neon.control | 4 - contrib/neon/neon.h | 19 - contrib/neon/pagestore_client.h | 221 -- contrib/neon/pagestore_smgr.c | 1696 ------------ contrib/neon/relsize_cache.c | 167 -- contrib/neon/walproposer.c | 2403 ----------------- contrib/neon/walproposer.h | 540 ---- contrib/neon/walproposer_utils.c | 1110 -------- contrib/neon/walproposer_utils.h | 19 - contrib/neon_test_utils/Makefile | 25 - .../neon_test_utils/neon_test_utils--1.0.sql | 29 - .../neon_test_utils/neon_test_utils.control | 5 - contrib/neon_test_utils/neontest.c | 304 --- 20 files changed, 7806 deletions(-) delete mode 100644 contrib/neon/Makefile delete mode 100644 contrib/neon/inmem_smgr.c delete mode 100644 contrib/neon/libpagestore.c delete mode 100644 contrib/neon/libpqwalproposer.c delete mode 100644 contrib/neon/neon--1.0.sql delete mode 100644 contrib/neon/neon.c delete mode 100644 contrib/neon/neon.control delete mode 100644 contrib/neon/neon.h delete mode 100644 contrib/neon/pagestore_client.h delete mode 100644 contrib/neon/pagestore_smgr.c delete mode 100644 contrib/neon/relsize_cache.c delete mode 100644 contrib/neon/walproposer.c delete mode 100644 contrib/neon/walproposer.h delete mode 100644 contrib/neon/walproposer_utils.c delete mode 100644 contrib/neon/walproposer_utils.h delete mode 100644 contrib/neon_test_utils/Makefile delete mode 100644 contrib/neon_test_utils/neon_test_utils--1.0.sql delete mode 100644 contrib/neon_test_utils/neon_test_utils.control delete mode 100644 contrib/neon_test_utils/neontest.c diff --git a/contrib/Makefile b/contrib/Makefile index 9caec6cb81f..f27e458482e 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -26,8 +26,6 @@ SUBDIRS = \ isn \ lo \ ltree \ - neon \ - neon_test_utils \ oid2name \ old_snapshot \ pageinspect \ diff --git a/contrib/neon/Makefile b/contrib/neon/Makefile deleted file mode 100644 index d1f48a988c6..00000000000 --- a/contrib/neon/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# contrib/neon/Makefile - - -MODULE_big = neon -OBJS = \ - $(WIN32RES) \ - inmem_smgr.o \ - libpagestore.o \ - libpqwalproposer.o \ - pagestore_smgr.o \ - relsize_cache.o \ - 
neon.o \ - walproposer.o \ - walproposer_utils.o - -PG_CPPFLAGS = -I$(libpq_srcdir) -SHLIB_LINK_INTERNAL = $(libpq) - -EXTENSION = neon -DATA = neon--1.0.sql -PGFILEDESC = "neon - cloud storage for PostgreSQL" - - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -SHLIB_PREREQS = submake-libpq -subdir = contrib/neon -top_builddir = ../.. -include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/neon/inmem_smgr.c b/contrib/neon/inmem_smgr.c deleted file mode 100644 index 7840292b08c..00000000000 --- a/contrib/neon/inmem_smgr.c +++ /dev/null @@ -1,286 +0,0 @@ -/*------------------------------------------------------------------------- - * - * inmem_smgr.c - * - * This is an implementation of the SMGR interface, used in the WAL redo - * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent - * storage, the pages that are written out are kept in a small number of - * in-memory buffers. - * - * Normally, replaying a WAL record only needs to access a handful of - * buffers, which fit in the normal buffer cache, so this is just for - * "overflow" storage when the buffer cache is not large enough. - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * contrib/neon/inmem_smgr.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/xlog.h" -#include "pagestore_client.h" -#include "storage/block.h" -#include "storage/buf_internals.h" -#include "storage/relfilenode.h" -#include "storage/smgr.h" - -/* Size of the in-memory smgr */ -#define MAX_PAGES 64 - -/* If more than WARN_PAGES are used, print a warning in the log */ -#define WARN_PAGES 32 - -static BufferTag page_tag[MAX_PAGES]; -static char page_body[MAX_PAGES][BLCKSZ]; -static int used_pages; - -static int -locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) -{ - /* We only hold a small number of pages, so linear search */ - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum - && blkno == page_tag[i].blockNum) - { - return i; - } - } - return -1; -} - -/* - * inmem_init() -- Initialize private state - */ -void -inmem_init(void) -{ - used_pages = 0; -} - -/* - * inmem_exists() -- Does the physical file exist? - */ -bool -inmem_exists(SMgrRelation reln, ForkNumber forknum) -{ - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum) - { - return true; - } - } - return false; -} - -/* - * inmem_create() -- Create a new relation on zenithd storage - * - * If isRedo is true, it's okay for the relation to exist already. - */ -void -inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) -{ -} - -/* - * inmem_unlink() -- Unlink a relation. - */ -void -inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) -{ -} - -/* - * inmem_extend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. 
- */ -void -inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, - char *buffer, bool skipFsync) -{ - /* same as smgwrite() for us */ - inmem_write(reln, forknum, blkno, buffer, skipFsync); -} - -/* - * inmem_open() -- Initialize newly-opened relation. - */ -void -inmem_open(SMgrRelation reln) -{ -} - -/* - * inmem_close() -- Close the specified relation, if it isn't closed already. - */ -void -inmem_close(SMgrRelation reln, ForkNumber forknum) -{ -} - -/* - * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation - */ -bool -inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) -{ - return true; -} - -/* - * inmem_writeback() -- Tell the kernel to write pages back to storage. - */ -void -inmem_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ -} - -/* - * inmem_read() -- Read the specified block from a relation. - */ -void -inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, - char *buffer) -{ - int pg; - - pg = locate_page(reln, forknum, blkno); - if (pg < 0) - memset(buffer, 0, BLCKSZ); - else - memcpy(buffer, page_body[pg], BLCKSZ); -} - -/* - * inmem_write() -- Write the supplied block at the appropriate location. - * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use mdextend(). - */ -void -inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) -{ - int pg; - - pg = locate_page(reln, forknum, blocknum); - if (pg < 0) - { - /* - * We assume the buffer cache is large enough to hold all the buffers - * needed for most operations. Overflowing to this "in-mem smgr" in rare - * cases is OK. But if we find that we're using more than WARN_PAGES, - * print a warning so that we get alerted and get to investigate why - * we're accessing so many buffers. - */ - elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, - "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - blocknum, - used_pages); - if (used_pages == MAX_PAGES) - elog(ERROR, "Inmem storage overflow"); - - pg = used_pages; - used_pages++; - INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); - } else { - elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - blocknum, - used_pages); - } - memcpy(page_body[pg], buffer, BLCKSZ); -} - -/* - * inmem_nblocks() -- Get the number of blocks stored in a relation. - */ -BlockNumber -inmem_nblocks(SMgrRelation reln, ForkNumber forknum) -{ - /* - * It's not clear why a WAL redo function would call smgrnblocks(). - * During recovery, at least before reaching consistency, the size of a - * relation could be arbitrarily small, if it was truncated after the - * record being replayed, or arbitrarily large if it was extended - * afterwards. But one place where it's called is in - * XLogReadBufferExtended(): it extends the relation, if it's smaller than - * the requested page. That's a waste of time in the WAL redo - * process. Pretend that all relations are maximally sized to avoid it. - */ - return MaxBlockNumber; -} - -/* - * inmem_truncate() -- Truncate relation to specified number of blocks. 
- */ -void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) -{ -} - -/* - * inmem_immedsync() -- Immediately sync a relation to stable storage. - */ -void -inmem_immedsync(SMgrRelation reln, ForkNumber forknum) -{ -} - -static const struct f_smgr inmem_smgr = -{ - .smgr_init = inmem_init, - .smgr_shutdown = NULL, - .smgr_open = inmem_open, - .smgr_close = inmem_close, - .smgr_create = inmem_create, - .smgr_exists = inmem_exists, - .smgr_unlink = inmem_unlink, - .smgr_extend = inmem_extend, - .smgr_prefetch = inmem_prefetch, - .smgr_read = inmem_read, - .smgr_write = inmem_write, - .smgr_writeback = inmem_writeback, - .smgr_nblocks = inmem_nblocks, - .smgr_truncate = inmem_truncate, - .smgr_immedsync = inmem_immedsync, -}; - -const f_smgr * -smgr_inmem(BackendId backend, RelFileNode rnode) -{ - Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); - else - return &inmem_smgr; -} - -void -smgr_init_inmem() -{ - inmem_init(); -} diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c deleted file mode 100644 index 8678c306dee..00000000000 --- a/contrib/neon/libpagestore.c +++ /dev/null @@ -1,433 +0,0 @@ -/*------------------------------------------------------------------------- - * - * libpagestore.c - * Handles network communications with the remote pagestore. - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/libpqpagestore.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "pagestore_client.h" -#include "fmgr.h" -#include "access/xlog.h" - -#include "libpq-fe.h" -#include "libpq/pqformat.h" -#include "libpq/libpq.h" - -#include "miscadmin.h" -#include "pgstat.h" -#include "utils/guc.h" - -#include "neon.h" -#include "walproposer.h" -#include "walproposer_utils.h" - - -#define PageStoreTrace DEBUG5 - -#define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ - errhidestmt(true), errhidecontext(true))) - -bool connected = false; -PGconn *pageserver_conn = NULL; - -char *page_server_connstring_raw; - -static ZenithResponse *pageserver_call(ZenithRequest *request); -page_server_api api = { - .request = pageserver_call -}; - -static void -pageserver_connect() -{ - char *query; - int ret; - - Assert(!connected); - - pageserver_conn = PQconnectdb(page_server_connstring); - - if (PQstatus(pageserver_conn) == CONNECTION_BAD) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - PQfinish(pageserver_conn); - pageserver_conn = NULL; - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "could not establish connection to pageserver"), - errdetail_internal("%s", msg))); - } - - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); - ret = PQsendQuery(pageserver_conn, query); - if (ret != 1) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - neon_log(ERROR, "could not send pagestream command to pageserver"); - } - - while (PQisBusy(pageserver_conn)) - { - int wc; - - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(pageserver_conn), - -1L, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? 
*/ - if (wc & WL_SOCKET_READABLE) - { - if (!PQconsumeInput(pageserver_conn)) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - PQfinish(pageserver_conn); - pageserver_conn = NULL; - - neon_log(ERROR, "could not complete handshake with pageserver: %s", - msg); - } - } - } - - neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); - - connected = true; -} - -/* - * A wrapper around PQgetCopyData that checks for interrupts while sleeping. - */ -static int -call_PQgetCopyData(PGconn *conn, char **buffer) -{ - int ret; - -retry: - ret = PQgetCopyData(conn, buffer, 1 /* async */ ); - - if (ret == 0) - { - int wc; - - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(conn), - -1L, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) - { - if (!PQconsumeInput(conn)) - neon_log(ERROR, "could not get response from pageserver: %s", - PQerrorMessage(conn)); - } - - goto retry; - } - - return ret; -} - - -static ZenithResponse * -pageserver_call(ZenithRequest *request) -{ - StringInfoData req_buff; - StringInfoData resp_buff; - ZenithResponse *resp; - - PG_TRY(); - { - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - - if (!connected) - pageserver_connect(); - - req_buff = zm_pack_request(request); - - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. - */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - neon_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); - - neon_log(PageStoreTrace, "sent request: %s", msg); - pfree(msg); - } - - /* read response */ - resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); - resp_buff.cursor = 0; - - if (resp_buff.len == -1) - neon_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - - resp = zm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) resp); - - neon_log(PageStoreTrace, "got response: %s", msg); - pfree(msg); - } - } - PG_CATCH(); - { - /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. 
- */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - PG_RE_THROW(); - } - PG_END_TRY(); - - return (ZenithResponse *) resp; -} - - -static bool -check_zenith_id(char **newval, void **extra, GucSource source) -{ - uint8 zid[16]; - - return **newval == '\0' || HexDecodeString(zid, *newval, 16); -} - -static char * -substitute_pageserver_password(const char *page_server_connstring_raw) -{ - char *host = NULL; - char *port = NULL; - char *user = NULL; - char *auth_token = NULL; - char *err = NULL; - char *page_server_connstring = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - MemoryContext oldcontext; - - /* - * Here we substitute password in connection string with an environment - * variable. To simplify things we construct a connection string back with - * only known options. In particular: host port user and password. We do - * not currently use other options and constructing full connstring in an - * URI shape is quite messy. - */ - - if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') - return NULL; - - /* extract the auth token from the connection string */ - conn_options = PQconninfoParse(page_server_connstring_raw, &err); - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? pstrdup(err) : "out of memory"; - - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. - */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - if (strcmp(conn_option->keyword, "host") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - host = conn_option->val; - } - else if (strcmp(conn_option->keyword, "port") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - port = conn_option->val; - } - else if (strcmp(conn_option->keyword, "user") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - user = conn_option->val; - } - else if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - neon_log(LOG, "using auth token from environment passed via env"); - } - } - } - } - - /* - * allocate connection string in TopMemoryContext to make sure it is not - * freed - */ - oldcontext = CurrentMemoryContext; - MemoryContextSwitchTo(TopMemoryContext); - page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); - MemoryContextSwitchTo(oldcontext); - - PQconninfoFree(conn_options); - return page_server_connstring; -} - -/* - * Module initialization function - */ -void -pg_init_libpagestore(void) -{ - DefineCustomStringVariable("neon.pageserver_connstring", - "connection string to the page server", - NULL, - &page_server_connstring_raw, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); - - DefineCustomStringVariable("neon.timeline_id", - "Zenith timelineid the server is running on", - NULL, - &zenith_timeline, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - check_zenith_id, NULL, NULL); - - DefineCustomStringVariable("neon.tenant_id", - "Neon tenantid the server is running on", - NULL, - &zenith_tenant, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - check_zenith_id, NULL, NULL); - - DefineCustomBoolVariable("neon.wal_redo", - "start in wal-redo mode", - NULL, - &wal_redo, - false, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); - - DefineCustomIntVariable("neon.max_cluster_size", - "cluster size limit", - NULL, - &max_cluster_size, - -1, -1, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MB, - NULL, NULL, NULL); - - relsize_hash_init(); - EmitWarningsOnPlaceholders("neon"); - - if (page_server != NULL) - neon_log(ERROR, "libpagestore already loaded"); - - neon_log(PageStoreTrace, "libpagestore already loaded"); - page_server = &api; - - /* substitute password in pageserver_connstring */ - page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); - - /* Is there more correct way to pass CustomGUC to postgres code? */ - zenith_timeline_walproposer = zenith_timeline; - zenith_tenant_walproposer = zenith_tenant; - - if (wal_redo) - { - neon_log(PageStoreTrace, "set inmem_smgr hook"); - smgr_hook = smgr_inmem; - smgr_init_hook = smgr_init_inmem; - } - else if (page_server_connstring && page_server_connstring[0]) - { - neon_log(PageStoreTrace, "set neon_smgr hook"); - smgr_hook = smgr_zenith; - smgr_init_hook = smgr_init_zenith; - dbsize_hook = zenith_dbsize; - } -} diff --git a/contrib/neon/libpqwalproposer.c b/contrib/neon/libpqwalproposer.c deleted file mode 100644 index 2b2b7a1a6a4..00000000000 --- a/contrib/neon/libpqwalproposer.c +++ /dev/null @@ -1,413 +0,0 @@ -#include "postgres.h" - -#include "libpq-fe.h" -#include "neon.h" -#include "walproposer.h" - -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn* pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from libpqprop_async_read */ -}; - -/* Prototypes for exported functions */ -static char* libpqprop_error_message(WalProposerConn* conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); -static WalProposerConn* libpqprop_connect_start(char* conninfo); -static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); -static bool libpqprop_send_query(WalProposerConn* conn, char* query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static void libpqprop_finish(WalProposerConn* conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn* conn, 
void const* buf, size_t size); - -static WalProposerFunctionsType PQWalProposerFunctions = { - libpqprop_error_message, - libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, -}; - -/* Module initialization */ -void -pg_init_libpqwalproposer(void) -{ - if (WalProposerFunctions != NULL) - elog(ERROR, "libpqwalproposer already loaded"); - WalProposerFunctions = &PQWalProposerFunctions; -} - -/* Helper function */ -static bool -ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) -{ - /* If we're already correctly blocking or nonblocking, all good */ - if (is_nonblocking == conn->is_nonblocking) - return true; - - /* Otherwise, set it appropriately */ - if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) - return false; - - conn->is_nonblocking = is_nonblocking; - return true; -} - -/* Exported function definitions */ -static char* -libpqprop_error_message(WalProposerConn* conn) -{ - return PQerrorMessage(conn->pg_conn); -} - -static WalProposerConnStatusType -libpqprop_status(WalProposerConn* conn) -{ - switch (PQstatus(conn->pg_conn)) - { - case CONNECTION_OK: - return WP_CONNECTION_OK; - case CONNECTION_BAD: - return WP_CONNECTION_BAD; - default: - return WP_CONNECTION_IN_PROGRESS; - } -} - -static WalProposerConn* -libpqprop_connect_start(char* conninfo) -{ - WalProposerConn* conn; - PGconn* pg_conn; - - pg_conn = PQconnectStart(conninfo); - /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the - * behavior of PQconnectStart here. - */ - if (!pg_conn) - return NULL; - - /* - * And in theory this allocation can fail as well, but it's incredibly unlikely if we just - * successfully allocated a PGconn. - * - * palloc will exit on failure though, so there's not much we could do if it *did* fail. - */ - conn = palloc(sizeof(WalProposerConn)); - conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking mode */ - conn->recvbuf = NULL; - return conn; -} - -static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn* conn) -{ - WalProposerConnectPollStatusType return_val; - - switch (PQconnectPoll(conn->pg_conn)) - { - case PGRES_POLLING_FAILED: - return_val = WP_CONN_POLLING_FAILED; - break; - case PGRES_POLLING_READING: - return_val = WP_CONN_POLLING_READING; - break; - case PGRES_POLLING_WRITING: - return_val = WP_CONN_POLLING_WRITING; - break; - case PGRES_POLLING_OK: - return_val = WP_CONN_POLLING_OK; - break; - - /* There's a comment at its source about this constant being unused. We'll expect it's never - * returned. 
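/*
 * A minimal sketch of the polling loop these wrappers are written for,
 * assuming the caller blocks on the socket between polls (the real driver
 * loop lives in walproposer.c, which is not part of this file).
 * sketch_connect() is illustrative only; it borrows the WaitLatchOrSocket
 * pattern used in libpagestore.c above.
 */
static bool
sketch_connect(char *conninfo)
{
	WalProposerConn *conn = libpqprop_connect_start(conninfo);

	if (conn == NULL || libpqprop_status(conn) == WP_CONNECTION_BAD)
		return false;

	for (;;)
	{
		switch (libpqprop_connect_poll(conn))
		{
			case WP_CONN_POLLING_OK:
				return true;
			case WP_CONN_POLLING_READING:
				(void) WaitLatchOrSocket(MyLatch,
										 WL_LATCH_SET | WL_SOCKET_READABLE | WL_EXIT_ON_PM_DEATH,
										 libpqprop_socket(conn), -1L, PG_WAIT_EXTENSION);
				break;
			case WP_CONN_POLLING_WRITING:
				(void) WaitLatchOrSocket(MyLatch,
										 WL_LATCH_SET | WL_SOCKET_WRITEABLE | WL_EXIT_ON_PM_DEATH,
										 libpqprop_socket(conn), -1L, PG_WAIT_EXTENSION);
				break;
			default:
				return false;	/* WP_CONN_POLLING_FAILED */
		}
		ResetLatch(MyLatch);
		CHECK_FOR_INTERRUPTS();
	}
}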
*/ - case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - /* This return is never actually reached, but it's here to make the compiler happy */ - return WP_CONN_POLLING_FAILED; - - default: - Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ - } - - return return_val; -} - -static bool -libpqprop_send_query(WalProposerConn* conn, char* query) -{ - /* We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush */ - if (!ensure_nonblocking_status(conn, false)) - return false; - - /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(conn->pg_conn, query)) - return false; - - return true; -} - -static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn* conn) -{ - PGresult* result; - WalProposerExecStatusType return_val; - - /* Marker variable if we need to log an unexpected success result */ - char* unexpected_success = NULL; - - /* Consume any input that we might be missing */ - if (!PQconsumeInput(conn->pg_conn)) - return WP_EXEC_FAILED; - - if (PQisBusy(conn->pg_conn)) - return WP_EXEC_NEEDS_INPUT; - - - result = PQgetResult(conn->pg_conn); - /* PQgetResult returns NULL only if getting the result was successful & there's no more of the - * result to get. */ - if (!result) - { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); - return WP_EXEC_UNEXPECTED_SUCCESS; - } - - /* Helper macro to reduce boilerplate */ - #define UNEXPECTED_SUCCESS(msg) \ - return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ - unexpected_success = msg; \ - break; - - - switch (PQresultStatus(result)) - { - /* "true" success case */ - case PGRES_COPY_BOTH: - return_val = WP_EXEC_SUCCESS_COPYBOTH; - break; - - /* Unexpected success case */ - case PGRES_EMPTY_QUERY: - UNEXPECTED_SUCCESS("empty query return"); - case PGRES_COMMAND_OK: - UNEXPECTED_SUCCESS("data-less command end"); - case PGRES_TUPLES_OK: - UNEXPECTED_SUCCESS("tuples return"); - case PGRES_COPY_OUT: - UNEXPECTED_SUCCESS("'Copy Out' response"); - case PGRES_COPY_IN: - UNEXPECTED_SUCCESS("'Copy In' response"); - case PGRES_SINGLE_TUPLE: - UNEXPECTED_SUCCESS("single tuple return"); - case PGRES_PIPELINE_SYNC: - UNEXPECTED_SUCCESS("pipeline sync point"); - - /* Failure cases */ - case PGRES_BAD_RESPONSE: - case PGRES_NONFATAL_ERROR: - case PGRES_FATAL_ERROR: - case PGRES_PIPELINE_ABORTED: - return_val = WP_EXEC_FAILED; - break; - - default: - Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ - } - - if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); - - return return_val; -} - -static pgsocket -libpqprop_socket(WalProposerConn* conn) -{ - return PQsocket(conn->pg_conn); -} - -static int -libpqprop_flush(WalProposerConn* conn) -{ - return (PQflush(conn->pg_conn)); -} - -static void -libpqprop_finish(WalProposerConn* conn) -{ - if (conn->recvbuf != NULL) - PQfreemem(conn->recvbuf); - PQfinish(conn->pg_conn); - pfree(conn); -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. 
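/*
 * A sketch of how a caller is expected to drive the async read defined below:
 * retry on PG_ASYNC_READ_TRY_AGAIN after waiting for the socket to become
 * readable. sketch_read_message() is hypothetical; the real consumer is the
 * walproposer state machine.
 */
static void
sketch_read_message(WalProposerConn *conn)
{
	char	   *buf;
	int			amount;

	for (;;)
	{
		switch (libpqprop_async_read(conn, &buf, &amount))
		{
			case PG_ASYNC_READ_SUCCESS:
				/* buf/amount stay valid only until the next call on this conn */
				elog(LOG, "received %d bytes from safekeeper", amount);
				return;

			case PG_ASYNC_READ_TRY_AGAIN:
				/* no complete message yet; wait for more data to arrive */
				(void) WaitLatchOrSocket(MyLatch,
										 WL_LATCH_SET | WL_SOCKET_READABLE | WL_EXIT_ON_PM_DEATH,
										 libpqprop_socket(conn), -1L, PG_WAIT_EXTENSION);
				ResetLatch(MyLatch);
				CHECK_FOR_INTERRUPTS();
				break;

			case PG_ASYNC_READ_FAIL:
				elog(ERROR, "safekeeper read failed: %s",
					 libpqprop_error_message(conn));
		}
	}
}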
- */ -static PGAsyncReadResult -libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) -{ - int result; - - if (conn->recvbuf != NULL) - { - PQfreemem(conn->recvbuf); - conn->recvbuf = NULL; - } - - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(conn->pg_conn)) - { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - - /* The docs for PQgetCopyData list the return values as: - * 0 if the copy is still in progress, but no "complete row" is - * available - * -1 if the copy is done - * -2 if an error occured - * (> 0) if it was successful; that value is the amount transferred. - * - * The protocol we use between walproposer and safekeeper means that we - * *usually* wouldn't expect to see that the copy is done, but this can - * sometimes be triggered by the server returning an ErrorResponse (which - * also happens to have the effect that the copy is done). - */ - switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) - { - case 0: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_TRY_AGAIN; - case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. - * - * We can check PQgetResult to make sure that the server failed; - * it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - - /* If there was actually an error, it'll be properly reported by - * calls to PQerrorMessage -- we don't have to do anything else */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - case -2: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - default: - /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = conn->recvbuf; - return PG_ASYNC_READ_SUCCESS; - } -} - -static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) -{ - int result; - - /* If we aren't in non-blocking mode, switch to it. */ - if (!ensure_nonblocking_status(conn, true)) - return PG_ASYNC_WRITE_FAIL; - - /* The docs for PQputcopyData list the return values as: - * 1 if the data was queued, - * 0 if it was not queued because of full buffers, or - * -1 if an error occured - */ - result = PQputCopyData(conn->pg_conn, buf, size); - - /* We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more */ - Assert(result != 0); - - switch (result) - { - case 1: - /* good -- continue */ - break; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQputCopyData", result); - } - - /* After queueing the data, we still need to flush to get it to send. - * This might take multiple tries, but we don't want to wait around - * until it's done. - * - * PQflush has the following returns (directly quoting the docs): - * 0 if sucessful, - * 1 if it was unable to send all the data in the send queue yet - * -1 if it failed for some reason - */ - switch (result = PQflush(conn->pg_conn)) { - case 0: - return PG_ASYNC_WRITE_SUCCESS; - case 1: - return PG_ASYNC_WRITE_TRY_FLUSH; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQflush", result); - } -} - -static bool -libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) -{ - int result; - - /* If we are in non-blocking mode, switch out of it. 
*/ - if (!ensure_nonblocking_status(conn, false)) - return false; - - /* Ths function is very similar to libpqprop_async_write. For more - * information, refer to the comments there */ - if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) - return false; - - Assert(result == 1); - - /* Because the connection is non-blocking, flushing returns 0 or -1 */ - - if ((result = PQflush(conn->pg_conn)) == -1) - return false; - - Assert(result == 0); - return true; -} diff --git a/contrib/neon/neon--1.0.sql b/contrib/neon/neon--1.0.sql deleted file mode 100644 index 34f1ba78d4f..00000000000 --- a/contrib/neon/neon--1.0.sql +++ /dev/null @@ -1,17 +0,0 @@ -\echo Use "CREATE EXTENSION neon" to load this file. \quit - -CREATE FUNCTION pg_cluster_size() -RETURNS bigint -AS 'MODULE_PATHNAME', 'pg_cluster_size' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION backpressure_lsns( - OUT received_lsn pg_lsn, - OUT disk_consistent_lsn pg_lsn, - OUT remote_consistent_lsn pg_lsn -) -RETURNS record -AS 'MODULE_PATHNAME', 'backpressure_lsns' -LANGUAGE C STRICT -PARALLEL UNSAFE; diff --git a/contrib/neon/neon.c b/contrib/neon/neon.c deleted file mode 100644 index 94ff9851eae..00000000000 --- a/contrib/neon/neon.c +++ /dev/null @@ -1,79 +0,0 @@ -/*------------------------------------------------------------------------- - * - * neon.c - * Utility functions to expose neon specific information to user - * - * IDENTIFICATION - * contrib/neon/neon.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" -#include "fmgr.h" - -#include "access/xact.h" -#include "access/xlog.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "catalog/pg_type.h" -#include "replication/walsender.h" -#include "funcapi.h" -#include "access/htup_details.h" -#include "utils/pg_lsn.h" - -#include "neon.h" -#include "walproposer.h" - -PG_MODULE_MAGIC; -void _PG_init(void); - - -void _PG_init(void) -{ - pg_init_libpagestore(); - pg_init_libpqwalproposer(); - pg_init_walproposer(); -} - -PG_FUNCTION_INFO_V1(pg_cluster_size); -PG_FUNCTION_INFO_V1(backpressure_lsns); - -Datum -pg_cluster_size(PG_FUNCTION_ARGS) -{ - int64 size; - - size = GetZenithCurrentClusterSize(); - - if (size == 0) - PG_RETURN_NULL(); - - PG_RETURN_INT64(size); -} - - -Datum -backpressure_lsns(PG_FUNCTION_ARGS) -{ - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - Datum values[3]; - bool nulls[3]; - TupleDesc tupdesc; - - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); - - tupdesc = CreateTemplateTupleDesc(3); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); - tupdesc = BlessTupleDesc(tupdesc); - - MemSet(nulls, 0, sizeof(nulls)); - values[0] = LSNGetDatum(writePtr); - values[1] = LSNGetDatum(flushPtr); - values[2] = LSNGetDatum(applyPtr); - - PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); -} diff --git a/contrib/neon/neon.control b/contrib/neon/neon.control deleted file mode 100644 index 84f79881c1e..00000000000 --- a/contrib/neon/neon.control +++ /dev/null @@ -1,4 +0,0 @@ -# neon extension -comment = 'cloud storage for PostgreSQL' -default_version = '1.0' -module_pathname = '$libdir/neon' diff --git a/contrib/neon/neon.h b/contrib/neon/neon.h deleted file mode 100644 index 2c66bc7bf05..00000000000 --- 
a/contrib/neon/neon.h +++ /dev/null @@ -1,19 +0,0 @@ -/*------------------------------------------------------------------------- - * - * neon.h - * Functions used in the initialization of this extension. - * - * IDENTIFICATION - * contrib/neon/neon.h - * - *------------------------------------------------------------------------- - */ - -#ifndef NEON_H -#define NEON_H - -extern void pg_init_libpagestore(void); -extern void pg_init_libpqwalproposer(void); -extern void pg_init_walproposer(void); - -#endif /* NEON_H */ diff --git a/contrib/neon/pagestore_client.h b/contrib/neon/pagestore_client.h deleted file mode 100644 index f79a3c9142f..00000000000 --- a/contrib/neon/pagestore_client.h +++ /dev/null @@ -1,221 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pagestore_client.h - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * contrib/neon/pagestore_client.h - * - *------------------------------------------------------------------------- - */ -#ifndef pageserver_h -#define pageserver_h - -#include "postgres.h" - -#include "access/xlogdefs.h" -#include "storage/relfilenode.h" -#include "storage/block.h" -#include "storage/smgr.h" -#include "lib/stringinfo.h" -#include "libpq/pqformat.h" -#include "utils/memutils.h" - -#include "pg_config.h" - -typedef enum -{ - /* pagestore_client -> pagestore */ - T_ZenithExistsRequest = 0, - T_ZenithNblocksRequest, - T_ZenithGetPageRequest, - T_ZenithDbSizeRequest, - - /* pagestore -> pagestore_client */ - T_ZenithExistsResponse = 100, - T_ZenithNblocksResponse, - T_ZenithGetPageResponse, - T_ZenithErrorResponse, - T_ZenithDbSizeResponse, -} ZenithMessageTag; - - - -/* base struct for c-style inheritance */ -typedef struct -{ - ZenithMessageTag tag; -} ZenithMessage; - -#define messageTag(m) (((const ZenithMessage *)(m))->tag) - -/* - * supertype of all the Zenith*Request structs below - * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
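/*
 * A simplified illustration of how these two fields get filled in (compare
 * zenith_get_request_lsn() in pagestore_smgr.c later in this patch, which
 * additionally handles the walsender case and flushes WAL when the hint is
 * ahead of the flush pointer). sketch_fill_request() is not part of the patch.
 */
static inline void
sketch_fill_request(ZenithRequest *req)
{
	if (RecoveryInProgress())
	{
		/* replica: pin the request to exactly what has been replayed */
		req->latest = false;
		req->lsn = GetXLogReplayRecPtr(NULL);
	}
	else
	{
		/* primary: ask for the latest version; the LSN is only a hint */
		req->latest = true;
		req->lsn = GetLastWrittenPageLSN();
	}
}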
- */ -typedef struct -{ - ZenithMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; -} ZenithExistsRequest; - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; -} ZenithNblocksRequest; - - -typedef struct -{ - ZenithRequest req; - Oid dbNode; -} ZenithDbSizeRequest; - - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blkno; -} ZenithGetPageRequest; - -/* supertype of all the Zenith*Response structs below */ -typedef struct -{ - ZenithMessageTag tag; -} ZenithResponse; - -typedef struct -{ - ZenithMessageTag tag; - bool exists; -} ZenithExistsResponse; - -typedef struct -{ - ZenithMessageTag tag; - uint32 n_blocks; -} ZenithNblocksResponse; - -typedef struct -{ - ZenithMessageTag tag; - char page[FLEXIBLE_ARRAY_MEMBER]; -} ZenithGetPageResponse; - -typedef struct -{ - ZenithMessageTag tag; - int64 db_size; -} ZenithDbSizeResponse; - -typedef struct -{ - ZenithMessageTag tag; - char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ -} ZenithErrorResponse; - -extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithResponse *zm_unpack_response(StringInfo s); -extern char *zm_to_string(ZenithMessage *msg); - -/* - * API - */ - -typedef struct -{ - ZenithResponse *(*request) (ZenithRequest *request); -} page_server_api; - -extern page_server_api *page_server; - -extern char *page_server_connstring; -extern char *zenith_timeline; -extern char *zenith_tenant; -extern bool wal_redo; -extern int32 max_cluster_size; - -extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); -extern void smgr_init_zenith(void); - -extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); -extern void smgr_init_inmem(void); -extern void smgr_shutdown_inmem(void); - -/* zenith storage manager functionality */ - -extern void zenith_init(void); -extern void zenith_open(SMgrRelation reln); -extern void zenith_close(SMgrRelation reln, ForkNumber forknum); -extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); -extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); - -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); - -extern void zenith_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern const int64 zenith_dbsize(Oid dbNode); -extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); - -/* zenith wal-redo storage manager functionality */ - -extern void inmem_init(void); -extern void inmem_open(SMgrRelation reln); -extern void 
inmem_close(SMgrRelation reln, ForkNumber forknum); -extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); -extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); -extern void inmem_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); -extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - - -/* utils for zenith relsize cache */ -extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); -extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); - -#endif diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c deleted file mode 100644 index 3e1b74dba7c..00000000000 --- a/contrib/neon/pagestore_smgr.c +++ /dev/null @@ -1,1696 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pagestore_smgr.c - * - * - * - * Temporary and unlogged rels - * --------------------------- - * - * Temporary and unlogged tables are stored locally, by md.c. The functions - * here just pass the calls through to corresponding md.c functions. - * - * Index build operations that use the buffer cache are also handled locally, - * just like unlogged tables. Such operations must be marked by calling - * smgr_start_unlogged_build() and friends. - * - * In order to know what relations are permanent and which ones are not, we - * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set - * by smgropen() callers, when they have the relcache entry at hand. However, - * sometimes we need to open an SmgrRelation for a relation without the - * relcache. That is needed when we evict a buffer; we might not have the - * SmgrRelation for that relation open yet. To deal with that, the - * 'relpersistence' can be left to zero, meaning we don't know if it's - * permanent or not. Most operations are not allowed with relpersistence==0, - * but smgrwrite() does work, which is what we need for buffer eviction. and - * smgrunlink() so that a backend doesn't need to have the relcache entry at - * transaction commit, where relations that were dropped in the transaction - * are unlinked. - * - * If smgrwrite() is called and smgr_relpersistence == 0, we check if the - * relation file exists locally or not. If it does exist, we assume it's an - * unlogged relation and write the page there. Otherwise it must be a - * permanent relation, WAL-logged and stored on the page server, and we ignore - * the write like we do for permanent relations. 
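/*
 * The dispatch on smgr_relpersistence described above is repeated by nearly
 * every entry point in this file. Shown here once as a simplified sketch,
 * modeled on zenith_extend() below; sketch_extend() itself is not part of the
 * patch, and the real functions also handle the relpersistence == 0 cases
 * discussed above.
 */
static void
sketch_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			  char *buffer, bool skipFsync)
{
	switch (reln->smgr_relpersistence)
	{
		case 0:
			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");

		case RELPERSISTENCE_TEMP:
		case RELPERSISTENCE_UNLOGGED:
			/* stored locally; hand straight off to md.c */
			mdextend(reln, forknum, blkno, buffer, skipFsync);
			return;

		case RELPERSISTENCE_PERMANENT:
			/*
			 * page-server-backed: WAL-log the page if needed and remember the
			 * new size (see zenith_extend() below for the full version)
			 */
			zenith_wallog_page(reln, forknum, blkno, buffer);
			set_cached_relsize(reln->smgr_rnode.node, forknum, blkno + 1);
			return;

		default:
			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
	}
}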
- * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/pagestore_smgr.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/xact.h" -#include "access/xlog.h" -#include "access/xloginsert.h" -#include "access/xlog_internal.h" -#include "catalog/pg_class.h" -#include "pagestore_client.h" -#include "pagestore_client.h" -#include "storage/smgr.h" -#include "access/xlogdefs.h" -#include "postmaster/interrupt.h" -#include "replication/walsender.h" -#include "storage/bufmgr.h" -#include "storage/md.h" -#include "fmgr.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "catalog/pg_tablespace_d.h" -#include "postmaster/autovacuum.h" - -/* - * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API - * calls to md.c, and *also* do the calls to the Page Server. On every - * read, compare the versions we read from local disk and Page Server, - * and Assert that they are identical. - */ -/* #define DEBUG_COMPARE_LOCAL */ - -#ifdef DEBUG_COMPARE_LOCAL -#include "access/nbtree.h" -#include "storage/bufpage.h" -#include "access/xlog_internal.h" - -static char *hexdump_page(char *page); -#endif - -#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) - -const int SmgrTrace = DEBUG5; - -page_server_api *page_server; - -/* GUCs */ -char *page_server_connstring; // with substituted password -char *zenith_timeline; -char *zenith_tenant; -bool wal_redo = false; -int32 max_cluster_size; - -/* unlogged relation build states */ -typedef enum -{ - UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, - UNLOGGED_BUILD_PHASE_1, - UNLOGGED_BUILD_PHASE_2, - UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; - -static SMgrRelation unlogged_build_rel = NULL; -static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - -StringInfoData -zm_pack_request(ZenithRequest *msg) -{ - StringInfoData s; - - initStringInfo(&s); - pq_sendbyte(&s, msg->tag); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_ZenithDbSizeRequest: - { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->dbNode); - - break; - } - case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, 
msg_req->blkno); - - break; - } - - /* pagestore -> pagestore_client. We never need to create these. */ - case T_ZenithExistsResponse: - case T_ZenithNblocksResponse: - case T_ZenithGetPageResponse: - case T_ZenithErrorResponse: - case T_ZenithDbSizeResponse: - default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); - break; - } - return s; -} - -ZenithResponse * -zm_unpack_response(StringInfo s) -{ - ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithResponse *resp = NULL; - - switch (tag) - { - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); - - msg_resp->tag = tag; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); - - msg_resp->tag = tag; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithGetPageResponse: - { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); - - msg_resp->tag = tag; - /* XXX: should be varlena */ - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithDbSizeResponse: - { - ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); - - msg_resp->tag = tag; - msg_resp->db_size = pq_getmsgint64(s); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; - - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); - - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); - msg_resp->tag = tag; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. 
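/*
 * For reference, the wire layout that zm_pack_request() above produces for a
 * T_ZenithGetPageRequest, read off its pq_send* calls (integers are emitted
 * in network byte order by pqformat):
 *
 *   byte    tag      (T_ZenithGetPageRequest)
 *   byte    latest
 *   int64   lsn
 *   int32   rnode.spcNode
 *   int32   rnode.dbNode
 *   int32   rnode.relNode
 *   byte    forknum
 *   int32   blkno
 *
 * The other request types share the same header (tag, latest, lsn) followed
 * by their own fields.
 */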
- */ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithGetPageRequest: - case T_ZenithDbSizeRequest: - default: - elog(ERROR, "unexpected zenith message tag 0x%02x", tag); - break; - } - - return resp; -} - -/* dump to json for debugging / error reporting purposes */ -char * -zm_to_string(ZenithMessage *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithDbSizeRequest: - { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); - appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); - appendStringInfoChar(&s, '}'); - - break; - } - case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); - appendStringInfoChar(&s, '}'); - - break; - } - case T_ZenithGetPageResponse: - { -#if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; -#endif - - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithErrorResponse: - { - 
ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; - - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithDbSizeResponse: - { - ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size - ); - appendStringInfoChar(&s, '}'); - - break; - } - - default: - appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); - } - return s.data; -} - -/* - * Wrapper around log_newpage() that makes a temporary copy of the block and - * WAL-logs that. This makes it safe to use while holding only a shared lock - * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint - * directly because it skips the logging if the LSN is new enough. - */ -static XLogRecPtr -log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, - Page page, bool page_std) -{ - PGAlignedBlock copied_buffer; - - memcpy(copied_buffer.data, page, BLCKSZ); - return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); -} - -/* - * Is 'buffer' identical to a freshly initialized empty heap page? - */ -static bool -PageIsEmptyHeapPage(char *buffer) -{ - PGAlignedBlock empty_page; - - PageInit((Page) empty_page.data, BLCKSZ, 0); - - return memcmp(buffer, empty_page.data, BLCKSZ) == 0; -} - -static void -zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) -{ - XLogRecPtr lsn = PageGetLSN(buffer); - - if (ShutdownRequestPending) - return; - - /* - * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM - * changes are not WAL-logged when the changes are made, so this is our - * last chance to log them, otherwise they're lost. That's OK for - * correctness, the non-logged updates are not critical. But we want to - * have a reasonably up-to-date VM and FSM in the page server. - */ - if (forknum == FSM_FORKNUM && !RecoveryInProgress()) - { - /* FSM is never WAL-logged and we don't care. */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - ereport(SmgrTrace, - (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) - { - /* - * Always WAL-log vm. We should never miss clearing visibility map - * bits. - * - * TODO Is it too bad for performance? Hopefully we do not evict - * actively used vm too often. - */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - - ereport(SmgrTrace, - (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (lsn == InvalidXLogRecPtr) - { - /* - * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, - * and we can just ignore that in Zenith. 
We do need to remember the new size, - * though, so that smgrnblocks() returns the right answer after the rel has - * been extended. We rely on the relsize cache for that. - * - * A completely empty heap page doesn't need to be WAL-logged, either. The - * heapam can leave such a page behind, if e.g. an insert errors out after - * initializing the page, but before it has inserted the tuple and WAL-logged - * the change. When we read the page from the page server, it will come back - * as all-zeros. That's OK, the heapam will initialize an all-zeros page on - * first use. - * - * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies - * that the page was not WAL-logged, and its contents will be lost when it's - * evicted. - */ - if (PageIsNew(buffer)) - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - else if (PageIsEmptyHeapPage(buffer)) - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - else - { - ereport(PANIC, - (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - } - else - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - - /* - * Remember the LSN on this page. When we read the page again, we must - * read the same or newer version of it. - */ - SetLastWrittenPageLSN(lsn); -} - - -/* - * zenith_init() -- Initialize private state - */ -void -zenith_init(void) -{ - /* noop */ -#ifdef DEBUG_COMPARE_LOCAL - mdinit(); -#endif -} - -/* - * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position - * to physical position in WAL. It always adds SizeOfXLogShortPHD: - * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; - * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. - * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. - */ -static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) -{ - /* - * If lsn points to the beging of first record on page or segment, then - * "return" it back to the page origin - */ - if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) - { - lsn -= SizeOfXLogShortPHD; - } - else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) - { - lsn -= SizeOfXLogLongPHD; - } - return lsn; -} - -/* - * Return LSN for requesting pages and number of blocks from page server - */ -static XLogRecPtr -zenith_get_request_lsn(bool *latest) -{ - XLogRecPtr lsn; - - if (RecoveryInProgress()) - { - *latest = false; - lsn = GetXLogReplayRecPtr(NULL); - elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - } - else if (am_walsender) - { - *latest = true; - lsn = InvalidXLogRecPtr; - elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); - } - else - { - XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache. 
Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. - */ - *latest = true; - lsn = GetLastWrittenPageLSN(); - Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = zm_adjust_lsn(lsn); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index building, - * _bt_blwritepage logs the full page without flushing WAL before - * smgrextend (files are fsynced before build ends). - */ - flushlsn = GetFlushRecPtr(); - if (lsn > flushlsn) - { - elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); - } - } - - return lsn; -} - - -/* - * zenith_exists() -- Does the physical file exist? - */ -bool -zenith_exists(SMgrRelation reln, ForkNumber forkNum) -{ - bool exists; - ZenithResponse *resp; - BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - /* - * We don't know if it's an unlogged rel stored locally, or permanent - * rel stored in the page server. First check if it exists locally. - * If it does, great. Otherwise check if it exists in the page server. - */ - if (mdexists(reln, forkNum)) - return true; - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdexists(reln, forkNum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) - { - return true; - } - - /* - * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server - * will error out if you check that, because the whole dbdir for tablespace - * 0, db 0 doesn't exists. We possibly should change the page server to - * accept that and return 'false', to be consistent with mdexists(). But - * we probably also should fix pg_table_size() to not call smgrexists() - * with bogus relfilenode. - * - * For now, handle that special case here. 
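/*
 * Aside: how the relsize cache declared in pagestore_client.h is kept in sync
 * by the functions in this file (summarized from the code, not new behavior):
 *
 *   zenith_create()  -> set_cached_relsize(rnode, forknum, 0)
 *   zenith_extend()  -> set_cached_relsize(rnode, forknum, blkno + 1)
 *   zenith_unlink()  -> forget_cached_relsize(rnode, forknum)
 *   zenith_exists()  -> get_cached_relsize() first, page server only on a miss
 */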
- */ - if (reln->smgr_rnode.node.spcNode == 0 && - reln->smgr_rnode.node.dbNode == 0 && - reln->smgr_rnode.node.relNode == 0) - { - return false; - } - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithExistsRequest request = { - .req.tag = T_ZenithExistsRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, - .forknum = forkNum - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithExistsResponse: - exists = ((ZenithExistsResponse *) resp)->exists; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - pfree(resp); - return exists; -} - -/* - * zenith_create() -- Create a new relation on zenithd storage - * - * If isRedo is true, it's okay for the relation to exist already. - */ -void -zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) -{ - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdcreate(reln, forkNum, isRedo); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - elog(SmgrTrace, "Create relation %u/%u/%u.%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum); - - /* - * Newly created relation is empty, remember that in the relsize cache. - * - * FIXME: This is currently not just an optimization, but required for - * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new - * relation created, so if we didn't remember the size in the relsize - * cache, we might call smgrnblocks() on the newly-created relation before - * the creation WAL record hass been received by the page server. - */ - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdcreate(reln, forkNum, isRedo); -#endif -} - -/* - * zenith_unlink() -- Unlink a relation. - * - * Note that we're passed a RelFileNodeBackend --- by the time this is called, - * there won't be an SMgrRelation hashtable entry anymore. - * - * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber - * to delete all forks. - * - * - * If isRedo is true, it's unsurprising for the relation to be already gone. - * Also, we should remove the file immediately instead of queuing a request - * for later, since during redo there's no possibility of creating a - * conflicting relation. - * - * Note: any failure should be reported as WARNING not ERROR, because - * we are usually not in a transaction anymore when this is called. - */ -void -zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) -{ - /* - * Might or might not exist locally, depending on whether it's - * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is - * set). 
Try to unlink, it won't do any harm if the file doesn't - * exist. - */ - mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { - forget_cached_relsize(rnode.node, forkNum); - } -} - -/* - * zenith_extend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer, bool skipFsync) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdextend(reln, forkNum, blkno, buffer, skipFsync); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* - * Check that the cluster size limit has not been exceeded. - * - * Temporary and unlogged relations are not included in the cluster size measured - * by the page server, so ignore those. Autovacuum processes are also exempt. - */ - if (max_cluster_size > 0 && - reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) - { - uint64 current_size = GetZenithCurrentClusterSize(); - - if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", - max_cluster_size), - errhint("This limit is defined by neon.max_cluster_size GUC"))); - } - - zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); - - lsn = PageGetLSN(buffer); - elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, blkno, - (uint32) (lsn >> 32), (uint32) lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdextend(reln, forkNum, blkno, buffer, skipFsync); -#endif -} - -/* - * zenith_open() -- Initialize newly-opened relation. - */ -void -zenith_open(SMgrRelation reln) -{ - /* - * We don't have anything special to do here. Call mdopen() to let md.c - * initialize itself. That's only needed for temporary or unlogged - * relations, but it's dirt cheap so do it always to make sure the md - * fields are initialized, for debugging purposes if nothing else. - */ - mdopen(reln); - - /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] open noop"); -} - -/* - * zenith_close() -- Close the specified relation, if it isn't closed already. - */ -void -zenith_close(SMgrRelation reln, ForkNumber forknum) -{ - /* - * Let md.c close it, if it had it open. Doesn't hurt to do this - * even for permanent relations that have no local storage. 
- */ - mdclose(reln, forknum); -} - -/* - * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation - */ -bool -zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) -{ - switch (reln->smgr_relpersistence) - { - case 0: - /* probably shouldn't happen, but ignore it */ - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdprefetch(reln, forknum, blocknum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); - return true; -} - -/* - * zenith_writeback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. - */ -void -zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ - switch (reln->smgr_relpersistence) - { - case 0: - /* mdwriteback() does nothing if the file doesn't exist */ - mdwriteback(reln, forknum, blocknum, nblocks); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdwriteback(reln, forknum, blocknum, nblocks); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwriteback(reln, forknum, blocknum, nblocks); -#endif -} - -/* - * While function is defined in the zenith extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. - */ -void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) -{ - ZenithResponse *resp; - - { - ZenithGetPageRequest request = { - .req.tag = T_ZenithGetPageRequest, - .req.latest = request_latest, - .req.lsn = request_lsn, - .rnode = rnode, - .forknum = forkNum, - .blkno = blkno - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithGetPageResponse: - memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, - rnode.spcNode, - rnode.dbNode, - rnode.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - - pfree(resp); -} - -/* - * zenith_read() -- Read the specified block from a relation. 
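/*
 * A hypothetical caller of zenith_read_at_lsn() above (for example a test
 * utility such as neon_test_utils): read one block pinned to an explicit LSN
 * rather than the automatically chosen request LSN. sketch_read_pinned() is
 * illustrative only and not part of the patch.
 */
static void
sketch_read_pinned(RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn)
{
	PGAlignedBlock page;

	zenith_read_at_lsn(rnode, MAIN_FORKNUM, blkno,
					   lsn,		/* request_lsn */
					   false,	/* request_latest: pin to exactly this LSN */
					   page.data);
	/* page.data now holds the page image as of 'lsn' */
}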
- */ -void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) -{ - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - request_lsn = zenith_get_request_lsn(&latest); - zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); - -#ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; - - mdread(reln, forkNum, blkno, mdbuf); - - memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); - - if (PageIsNew(mdbuf)) - { - if (!PageIsNew(pageserver_masked)) - { - elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffer)); - } - } - else if (PageIsNew(buffer)) - { - elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); - } - else if (PageGetSpecialSize(mdbuf) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) - { - elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), - hexdump_page(pageserver_masked)); - } - } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) - { - elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), - hexdump_page(pageserver_masked)); - } - } - } - } -#endif -} - -#ifdef DEBUG_COMPARE_LOCAL -static char * -hexdump_page(char *page) -{ - StringInfoData result; - - initStringInfo(&result); - - for (int i = 0; i < BLCKSZ; i++) - { - if (i % 8 == 0) - appendStringInfo(&result, " "); - if (i % 40 == 0) - appendStringInfo(&result, "\n"); - appendStringInfo(&result, "%02x", (unsigned char) (page[i])); - } - - return result.data; -} -#endif - -/* - * zenith_write() -- Write the 
supplied block at the appropriate location. - * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use mdextend(). - */ -void -zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - /* This is a bit tricky. Check if the relation exists locally */ - if (mdexists(reln, forknum)) - { - /* It exists locally. Guess it's unlogged then. */ - mdwrite(reln, forknum, blocknum, buffer, skipFsync); - - /* - * We could set relpersistence now that we have determined - * that it's local. But we don't dare to do it, because that - * would immediately allow reads as well, which shouldn't - * happen. We could cache it with a different 'relpersistence' - * value, but this isn't performance critical. - */ - return; - } - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - zenith_wallog_page(reln, forknum, blocknum, buffer); - - lsn = PageGetLSN(buffer); - elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); -#endif -} - -/* - * zenith_nblocks() -- Get the number of blocks stored in a relation. - */ -BlockNumber -zenith_nblocks(SMgrRelation reln, ForkNumber forknum) -{ - ZenithResponse *resp; - BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdnblocks(reln, forknum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) - { - elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, n_blocks); - return n_blocks; - } - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithNblocksRequest request = { - .req.tag = T_ZenithNblocksRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, - .forknum = forknum, - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithNblocksResponse: - n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - update_cached_relsize(reln->smgr_rnode.node, 
forknum, n_blocks); - - elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); - - pfree(resp); - return n_blocks; -} - -/* - * zenith_db_size() -- Get the size of the database in bytes. - */ -const int64 -zenith_dbsize(Oid dbNode) -{ - ZenithResponse *resp; - int64 db_size; - XLogRecPtr request_lsn; - bool latest; - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithDbSizeRequest request = { - .req.tag = T_ZenithDbSizeRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .dbNode = dbNode, - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithDbSizeResponse: - db_size = ((ZenithDbSizeResponse *) resp)->db_size; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - - elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); - - pfree(resp); - return db_size; -} - -/* - * zenith_truncate() -- Truncate relation to specified number of blocks. - */ -void -zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); - - /* - * Truncating a relation drops all its buffers from the buffer cache - * without calling smgrwrite() on them. But we must account for that in - * our tracking of last-written-LSN all the same: any future smgrnblocks() - * request must return the new size after the truncation. We don't know - * what the LSN of the truncation record was, so be conservative and use - * the most recently inserted WAL record's LSN. - */ - lsn = GetXLogInsertRecPtr(); - - lsn = zm_adjust_lsn(lsn); - - /* - * Flush it, too. We don't actually care about it here, but let's uphold - * the invariant that last-written LSN <= flush LSN. - */ - XLogFlush(lsn); - - SetLastWrittenPageLSN(lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); -#endif -} - -/* - * zenith_immedsync() -- Immediately sync a relation to stable storage. - * - * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. We - * sync active and inactive segments; smgrDoPendingSyncs() relies on this. - * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of - * some segment, then mdtruncate() renders that segment inactive. If we - * crash before the next checkpoint syncs the newly-inactive segment, that - * segment may survive recovery, reintroducing unwanted data into the table. 
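
The trace messages above all print a 64-bit LSN as two 32-bit halves with "%X/%08X". A tiny standalone sketch of that convention, with a made-up LSN value and a stand-in typedef rather than the real XLogRecPtr:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_lsn;      /* stand-in for XLogRecPtr */

int
main(void)
{
    demo_lsn lsn = 0x16B374D848ULL;

    /* Same convention as the elog() calls above: high/low 32-bit halves. */
    printf("%X/%08X\n", (unsigned) (lsn >> 32), (unsigned) lsn);
    return 0;
}
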
- */ -void -zenith_immedsync(SMgrRelation reln, ForkNumber forknum) -{ - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdimmedsync(reln, forknum); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdimmedsync(reln, forknum); -#endif -} - -/* - * zenith_start_unlogged_build() -- Starting build operation on a rel. - * - * Some indexes are built in two phases, by first populating the table with - * regular inserts, using the shared buffer cache but skipping WAL-logging, - * and WAL-logging the whole relation after it's done. Zenith relies on the - * WAL to reconstruct pages, so we cannot use the page server in the - * first phase when the changes are not logged. - */ -static void -zenith_start_unlogged_build(SMgrRelation reln) -{ - /* - * Currently, there can be only one unlogged relation build operation in - * progress at a time. That's enough for the current usage. - */ - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) - elog(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); - - ereport(SmgrTrace, - (errmsg("starting unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; - unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (smgrnblocks(reln, MAIN_FORKNUM) != 0) - elog(ERROR, "cannot perform unlogged index build, index is not empty "); - - unlogged_build_rel = reln; - unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; - - /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - - /* - * FIXME: should we pass isRedo true to create the tablespace dir if it - * doesn't exist? Is it needed? - */ - mdcreate(reln, MAIN_FORKNUM, false); -} - -/* - * zenith_finish_unlogged_build_phase_1() - * - * Call this after you have finished populating a relation in unlogged mode, - * before you start WAL-logging it. - */ -static void -zenith_finish_unlogged_build_phase_1(SMgrRelation reln) -{ - Assert(unlogged_build_rel == reln); - - ereport(SmgrTrace, - (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) - return; - - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; -} - -/* - * zenith_end_unlogged_build() -- Finish an unlogged rel build. - * - * Call this after you have finished WAL-logging an relation that was - * first populated without WAL-logging. - * - * This removes the local copy of the rel, since it's now been fully - * WAL-logged and is present in the page server. 
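
A compact sketch of the unlogged-build state machine described above: start moves the build into phase 1 (populate without WAL), finishing phase 1 moves to phase 2 (WAL-log everything), and ending the build returns to the idle state. The enum and helper names mirror the phases used by the functions but are illustrative only.

#include <assert.h>

typedef enum
{
    BUILD_NOT_IN_PROGRESS,      /* no unlogged build running */
    BUILD_NOT_PERMANENT,        /* rel is temp/unlogged; nothing special to do */
    BUILD_PHASE_1,              /* populating the rel without WAL-logging */
    BUILD_PHASE_2               /* WAL-logging the whole rel */
} build_phase;

static build_phase phase = BUILD_NOT_IN_PROGRESS;

static void start_build(void)    { assert(phase == BUILD_NOT_IN_PROGRESS); phase = BUILD_PHASE_1; }
static void finish_phase_1(void) { assert(phase == BUILD_PHASE_1); phase = BUILD_PHASE_2; }
static void end_build(void)      { assert(phase == BUILD_PHASE_2); phase = BUILD_NOT_IN_PROGRESS; }

int
main(void)
{
    /* The only legal order of transitions for a permanent relation. */
    start_build();
    finish_phase_1();
    end_build();
    return 0;
}
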
- */ -static void -zenith_end_unlogged_build(SMgrRelation reln) -{ - Assert(unlogged_build_rel == reln); - - ereport(SmgrTrace, - (errmsg("ending unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) - { - RelFileNodeBackend rnode; - - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - - /* Remove local copy */ - rnode = reln->smgr_rnode; - for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", - rnode.node.spcNode, - rnode.node.dbNode, - rnode.node.relNode, - forknum); - - forget_cached_relsize(rnode.node, forknum); - mdclose(reln, forknum); - /* use isRedo == true, so that we drop it immediately */ - mdunlink(rnode, forknum, true); - } - } - - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; -} - -static void -AtEOXact_zenith(XactEvent event, void *arg) -{ - switch (event) - { - case XACT_EVENT_ABORT: - case XACT_EVENT_PARALLEL_ABORT: - - /* - * Forget about any build we might have had in progress. The local - * file will be unlinked by smgrDoPendingDeletes() - */ - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - break; - - case XACT_EVENT_COMMIT: - case XACT_EVENT_PARALLEL_COMMIT: - case XACT_EVENT_PREPARE: - case XACT_EVENT_PRE_COMMIT: - case XACT_EVENT_PARALLEL_PRE_COMMIT: - case XACT_EVENT_PRE_PREPARE: - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) - { - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - (errmsg("unlogged index build was not properly finished")))); - } - break; - } -} - -static const struct f_smgr zenith_smgr = -{ - .smgr_init = zenith_init, - .smgr_shutdown = NULL, - .smgr_open = zenith_open, - .smgr_close = zenith_close, - .smgr_create = zenith_create, - .smgr_exists = zenith_exists, - .smgr_unlink = zenith_unlink, - .smgr_extend = zenith_extend, - .smgr_prefetch = zenith_prefetch, - .smgr_read = zenith_read, - .smgr_write = zenith_write, - .smgr_writeback = zenith_writeback, - .smgr_nblocks = zenith_nblocks, - .smgr_truncate = zenith_truncate, - .smgr_immedsync = zenith_immedsync, - - .smgr_start_unlogged_build = zenith_start_unlogged_build, - .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, - .smgr_end_unlogged_build = zenith_end_unlogged_build, -}; - - -const f_smgr * -smgr_zenith(BackendId backend, RelFileNode rnode) -{ - - /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); - else - return &zenith_smgr; -} - -void -smgr_init_zenith(void) -{ - RegisterXactCallback(AtEOXact_zenith, NULL); - - smgr_init_standard(); - zenith_init(); -} diff --git a/contrib/neon/relsize_cache.c b/contrib/neon/relsize_cache.c deleted file mode 100644 index 8dfcffe1d16..00000000000 --- a/contrib/neon/relsize_cache.c +++ /dev/null @@ -1,167 +0,0 @@ -/*------------------------------------------------------------------------- - * - * relsize_cache.c - * Relation size cache for better zentih performance. 
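
For context, a hypothetical sketch of how an extension could route storage through smgr_zenith() via the pluggable-smgr hooks introduced in this series. The typedefs below are stand-ins for the real declarations (normally provided by storage/smgr.h after the core patch), and the actual extension wiring may differ.

/* Stand-in declarations; in the real tree these come from the smgr patch. */
typedef struct f_smgr f_smgr;
typedef int BackendId;
typedef struct RelFileNode RelFileNode;
typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode);
typedef void (*smgr_init_hook_type) (void);

extern smgr_hook_type smgr_hook;
extern smgr_init_hook_type smgr_init_hook;
extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode);
extern void smgr_init_zenith(void);

/* Hypothetical extension entry point: select the zenith smgr at load time. */
void
_PG_init(void)
{
    smgr_hook = smgr_zenith;
    smgr_init_hook = smgr_init_zenith;
}
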
- * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/relsize_cache.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "pagestore_client.h" -#include "storage/relfilenode.h" -#include "storage/smgr.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/shmem.h" -#include "catalog/pg_tablespace_d.h" -#include "utils/dynahash.h" -#include "utils/guc.h" - - -typedef struct -{ - RelFileNode rnode; - ForkNumber forknum; -} RelTag; - -typedef struct -{ - RelTag tag; - BlockNumber size; -} RelSizeEntry; - -static HTAB *relsize_hash; -static LWLockId relsize_lock; -static int relsize_hash_size; -static shmem_startup_hook_type prev_shmem_startup_hook = NULL; - -/* - * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, - * which seems reasonable. - */ -#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) - -static void -zenith_smgr_shmem_startup(void) -{ - static HASHCTL info; - - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); - info.keysize = sizeof(RelTag); - info.entrysize = sizeof(RelSizeEntry); - relsize_hash = ShmemInitHash("neon_relsize", - relsize_hash_size, relsize_hash_size, - &info, - HASH_ELEM | HASH_BLOBS); - LWLockRelease(AddinShmemInitLock); -} - -bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) -{ - bool found = false; - - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_SHARED); - entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); - if (entry != NULL) - { - *size = entry->size; - found = true; - } - LWLockRelease(relsize_lock); - } - return found; -} - -void -set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); - entry->size = size; - LWLockRelease(relsize_lock); - } -} - -void -update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - bool found; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); - if (!found || entry->size < size) - entry->size = size; - LWLockRelease(relsize_lock); - } -} - -void -forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); - LWLockRelease(relsize_lock); - } -} - -void -relsize_hash_init(void) -{ - DefineCustomIntVariable("neon.relsize_hash_size", - "Sets the maximum number of cached relation sizes for neon", - NULL, - &relsize_hash_size, - DEFAULT_RELSIZE_HASH_SIZE, - 0, - INT_MAX, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); - - if (relsize_hash_size > 0) - { - RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("neon_relsize", 1); - - 
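
A back-of-the-envelope check of the sizing comment above: a RelTag (RelFileNode plus ForkNumber, 16 bytes) and a BlockNumber (4 bytes) make a 20-byte entry, so the default of 64 * 1024 entries needs roughly 1.25 MB before dynahash overhead (which hash_estimate_size() accounts for). A standalone sketch of the arithmetic:

#include <stdio.h>

int
main(void)
{
    const unsigned entry_bytes = 16 + 4;        /* RelTag + BlockNumber */
    const unsigned default_entries = 64 * 1024; /* DEFAULT_RELSIZE_HASH_SIZE */

    printf("%.2f MB\n",
           entry_bytes * (double) default_entries / (1024.0 * 1024.0));
    return 0;
}
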
prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = zenith_smgr_shmem_startup; - } -} diff --git a/contrib/neon/walproposer.c b/contrib/neon/walproposer.c deleted file mode 100644 index 9625325c0a9..00000000000 --- a/contrib/neon/walproposer.c +++ /dev/null @@ -1,2403 +0,0 @@ -/*------------------------------------------------------------------------- - * - * walproposer.c - * - * Proposer/leader part of the total order broadcast protocol between postgres - * and WAL safekeepers. - * - * We have two ways of launching WalProposer: - * - * 1. As a background worker which will run physical WalSender with - * am_wal_proposer flag set to true. WalSender in turn would handle WAL - * reading part and call WalProposer when ready to scatter WAL. - * - * 2. As a standalone utility by running `postgres --sync-safekeepers`. That - * is needed to create LSN from which it is safe to start postgres. More - * specifically it addresses following problems: - * - * a) Chicken-or-the-egg problem: compute postgres needs data directory - * with non-rel files that are downloaded from pageserver by calling - * basebackup@LSN. This LSN is not arbitrary, it must include all - * previously committed transactions and defined through consensus - * voting, which happens... in walproposer, a part of compute node. - * - * b) Just warranting such LSN is not enough, we must also actually commit - * it and make sure there is a safekeeper who knows this LSN is - * committed so WAL before it can be streamed to pageserver -- otherwise - * basebackup will hang waiting for WAL. Advancing commit_lsn without - * playing consensus game is impossible, so speculative 'let's just poll - * safekeepers, learn start LSN of future epoch and run basebackup' - * won't work. - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include -#include -#include -#include "access/xlogdefs.h" -#include "access/xlogutils.h" -#include "storage/latch.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "access/xlog.h" -#include "libpq/pqformat.h" -#include "replication/slot.h" -#include "replication/walreceiver.h" -#include "postmaster/bgworker.h" -#include "postmaster/interrupt.h" -#include "postmaster/postmaster.h" -#include "storage/pmsignal.h" -#include "storage/proc.h" -#include "storage/ipc.h" -#include "storage/lwlock.h" -#include "storage/shmem.h" -#include "storage/spin.h" -#include "tcop/tcopprot.h" -#include "utils/builtins.h" -#include "utils/guc.h" -#include "utils/memutils.h" -#include "utils/timestamp.h" - -#include "neon.h" -#include "walproposer.h" -#include "walproposer_utils.h" -#include "replication/walpropshim.h" - - -char *wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -int wal_acceptor_connect_timeout; -bool am_wal_proposer; - -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; - -/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ -WalProposerFunctionsType *WalProposerFunctions = NULL; - -#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" - -static int n_safekeepers = 0; -static int quorum = 0; -static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ -static ProposerGreeting greetRequest; -static VoteRequest voteRequest; /* Vote request for safekeeper */ -static WaitEventSet *waitEvents; -static AppendResponse quorumFeedback; 
-/* - * Minimal LSN which may be needed for recovery of some safekeeper, - * record-aligned (first record which might not yet received by someone). - */ -static XLogRecPtr truncateLsn; -/* - * Term of the proposer. We want our term to be highest and unique, - * so we collect terms from safekeepers quorum, choose max and +1. - * After that our term is fixed and must not change. If we observe - * that some safekeeper has higher term, it means that we have another - * running compute, so we must stop immediately. - */ -static term_t propTerm; -static TermHistory propTermHistory; /* term history of the proposer */ -static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ -static term_t donorEpoch; /* Most advanced acceptor epoch */ -static int donor; /* Most advanced acceptor */ -static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -static int n_votes = 0; -static int n_connected = 0; -static TimestampTz last_reconnect_attempt; - -static WalproposerShmemState *walprop_shared; - -/* Prototypes for private functions */ -static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId); -static void WalProposerStartImpl(void); -static void WalProposerLoop(void); -static void InitEventSet(void); -static void UpdateEventSet(Safekeeper *sk, uint32 events); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); -static void ShutdownConnection(Safekeeper *sk); -static void ResetConnection(Safekeeper *sk); -static long TimeToReconnect(TimestampTz now); -static void ReconnectSafekeepers(void); -static void AdvancePollState(Safekeeper *sk, uint32 events); -static void HandleConnectionEvent(Safekeeper *sk); -static void SendStartWALPush(Safekeeper *sk); -static void RecvStartWALPushResult(Safekeeper *sk); -static void SendProposerGreeting(Safekeeper *sk); -static void RecvAcceptorGreeting(Safekeeper *sk); -static void SendVoteRequest(Safekeeper *sk); -static void RecvVoteResponse(Safekeeper *sk); -static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(Safekeeper *sk); -static void DetermineEpochStartLsn(void); -static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); -static void SendProposerElected(Safekeeper *sk); -static void WalProposerStartStreaming(XLogRecPtr startpos); -static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(Safekeeper *sk); -static void BroadcastAppendRequest(void); -static void HandleActiveState(Safekeeper *sk, uint32 events); -static bool SendAppendRequests(Safekeeper *sk); -static bool RecvAppendResponses(Safekeeper *sk); -static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); -static XLogRecPtr CalculateMinFlushLsn(void); -static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); -static void HandleSafekeeperResponse(void); -static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); -static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); -static bool AsyncFlush(Safekeeper *sk); - - -static void nwp_shmem_startup_hook(void); -static void nwp_register_gucs(void); -static void nwp_prepare_shmem(void); -static uint64 backpressure_lag_impl(void); - - -static shmem_startup_hook_type prev_shmem_startup_hook_type; - - - -void pg_init_walproposer(void) -{ - if 
(!process_shared_preload_libraries_in_progress) - return; - - nwp_register_gucs(); - - nwp_prepare_shmem(); - - delay_backend_us = &backpressure_lag_impl; - - WalProposerRegister(); - - WalProposerInit = &WalProposerInitImpl; - WalProposerStart = &WalProposerStartImpl; -} - -static void nwp_register_gucs(void) -{ - DefineCustomStringVariable( - "neon.safekeepers", - "List of Neon WAL acceptors (host:port)", - NULL, /* long_desc */ - &wal_acceptors_list, /* valueAddr */ - "", /* bootValue */ - PGC_POSTMASTER, - GUC_LIST_INPUT, /* extensions can't use GUC_LIST_QUOTE */ - NULL, NULL, NULL - ); - - DefineCustomIntVariable( - "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", - NULL, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ - PGC_SIGHUP, /* context */ - GUC_UNIT_MS, /* flags */ - NULL, NULL, NULL - ); - - DefineCustomIntVariable( - "neon.safekeeper_connect_timeout", - "Timeout after which give up connection attempt to safekeeper.", - NULL, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MS, - NULL, NULL, NULL - ); - -} - -/* shmem handling */ - -static void nwp_prepare_shmem(void) -{ - RequestAddinShmemSpace(WalproposerShmemSize()); - - prev_shmem_startup_hook_type = shmem_startup_hook; - shmem_startup_hook = nwp_shmem_startup_hook; -} - -static void nwp_shmem_startup_hook(void) -{ - if (prev_shmem_startup_hook_type) - prev_shmem_startup_hook_type(); - - WalproposerShmemInit(); -} - -/* - * WAL proposer bgworker entry point. - */ -void -WalProposerMain(Datum main_arg) -{ - /* Establish signal handlers. */ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - - GetXLogReplayRecPtr(&ThisTimeLineID); - - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); - - last_reconnect_attempt = GetCurrentTimestamp(); - - application_name = (char *) "walproposer"; /* for - * synchronous_standby_names */ - am_wal_proposer = true; - am_walsender = true; - InitWalSender(); - InitProcessPhase2(); - - /* Create replication slot for WAL proposer if not exists */ - if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) - { - ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); - ReplicationSlotReserveWal(); - /* Write this slot to disk */ - ReplicationSlotMarkDirty(); - ReplicationSlotSave(); - ReplicationSlotRelease(); - } - - WalProposerStart(); -} - -/* - * Create new AppendRequest message and start sending it. This function is - * called from walsender every time the new WAL is available. - */ -void -WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) -{ - Assert(startpos == availableLsn && endpos >= availableLsn); - availableLsn = endpos; - BroadcastAppendRequest(); -} - -/* - * Advance the WAL proposer state machine, waiting each time for events to occur. - * Will exit only when latch is set, i.e. new WAL should be pushed from walsender - * to walproposer. - */ -void -WalProposerPoll(void) -{ - while (true) - { - Safekeeper *sk; - int rc; - WaitEvent event; - TimestampTz now = GetCurrentTimestamp(); - - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - sk = (Safekeeper *) event.user_data; - - /* - * If the event contains something that one of our safekeeper states - * was waiting for, we'll advance its state. 
- */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(sk, event.events); - - /* - * If the timeout expired, attempt to reconnect to any safekeepers that - * we dropped - */ - ReconnectSafekeepers(); - - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); - break; - } - if (rc == 0) /* timeout expired: poll state */ - { - TimestampTz now; - - /* - * If no WAL was generated during timeout (and we have already - * collected the quorum), then send pool message - */ - if (availableLsn != InvalidXLogRecPtr) - { - BroadcastAppendRequest(); - } - - /* - * Abandon connection attempts which take too long. - */ - now = GetCurrentTimestamp(); - for (int i = 0; i < n_safekeepers; i++) - { - Safekeeper *sk = &safekeeper[i]; - - if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && - TimestampDifferenceExceeds(sk->startedConnAt, now, - wal_acceptor_connect_timeout)) - { - elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", - sk->host, sk->port, wal_acceptor_connect_timeout); - ShutdownConnection(sk); - } - } - } - } -} - -/* - * Register a background worker proposing WAL to wal acceptors. - */ -void -WalProposerRegister(void) -{ - BackgroundWorker bgw; - - if (*wal_acceptors_list == '\0') - return; - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} - -static void -WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId) -{ - char *host; - char *sep; - char *port; - - /* Load the libpq-specific functions */ - if (WalProposerFunctions == NULL) - elog(ERROR, "libpqwalproposer didn't initialize correctly"); - - load_file("libpqwalreceiver", false); - if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) - { - port = strchr(host, ':'); - if (port == NULL) - { - elog(FATAL, "port is not specified"); - } - *port++ = '\0'; - sep = strchr(port, ','); - if (sep != NULL) - *sep++ = '\0'; - if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) - { - elog(FATAL, "Too many safekeepers"); - } - safekeeper[n_safekeepers].host = host; - safekeeper[n_safekeepers].port = port; - safekeeper[n_safekeepers].state = SS_OFFLINE; - safekeeper[n_safekeepers].conn = NULL; - - /* - * Set conninfo to empty. 
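
The neon.safekeepers GUC parsed above is a comma-separated list of host:port pairs that is split in place with strchr(). A standalone sketch of that parsing, with hypothetical host names and without the safekeeper array or error handling of the real code:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    char  list[] = "sk-0:5454,sk-1:5454,sk-2:5454";
    char *host, *sep;

    for (host = list; host != NULL && *host != '\0'; host = sep)
    {
        char *port = strchr(host, ':');

        if (port == NULL)
            return 1;           /* a port is mandatory for every entry */
        *port++ = '\0';

        sep = strchr(port, ',');
        if (sep != NULL)
            *sep++ = '\0';

        printf("safekeeper host=%s port=%s\n", host, port);
    }
    return 0;
}
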
We'll fill it out once later, in - * `ResetConnection` as needed - */ - safekeeper[n_safekeepers].conninfo[0] = '\0'; - initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); - if (safekeeper[n_safekeepers].xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); - safekeeper[n_safekeepers].flushWrite = false; - safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; - safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; - n_safekeepers += 1; - } - if (n_safekeepers < 1) - { - elog(FATAL, "Safekeepers addresses are not specified"); - } - quorum = n_safekeepers / 2 + 1; - - /* Fill the greeting package */ - greetRequest.tag = 'g'; - greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - greetRequest.pgVersion = PG_VERSION_NUM; - pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); - greetRequest.systemId = systemId; - if (!zenith_timeline_walproposer) - elog(FATAL, "neon.timeline_id is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) - elog(FATAL, "neon.tenant_id is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); - - greetRequest.timeline = ThisTimeLineID; - greetRequest.walSegSize = wal_segment_size; - - InitEventSet(); -} - -static void -WalProposerStartImpl(void) -{ - - /* Initiate connections to all safekeeper nodes */ - for (int i = 0; i < n_safekeepers; i++) - { - ResetConnection(&safekeeper[i]); - } - - WalProposerLoop(); -} - -static void -WalProposerLoop(void) -{ - while (true) - WalProposerPoll(); -} - -/* Initializes the internal event set, provided that it is currently null */ -static void -InitEventSet(void) -{ - if (waitEvents) - elog(FATAL, "double-initialization of event set"); - - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); - AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); -} - -/* - * Updates the events we're already waiting on for the safekeeper, setting it to - * the provided `events` - * - * This function is called any time the safekeeper's state switches to one where - * it has to wait to continue. This includes the full body of AdvancePollState - * and calls to IO helper functions. - */ -static void -UpdateEventSet(Safekeeper *sk, uint32 events) -{ - /* eventPos = -1 when we don't have an event */ - Assert(sk->eventPos != -1); - - ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); -} - -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - /* Remove the existing event set */ - if (waitEvents) - { - FreeWaitEventSet(waitEvents); - waitEvents = NULL; - } - /* Re-initialize it without adding any safekeeper events */ - InitEventSet(); - - /* - * loop through the existing safekeepers. 
If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; - - sk->eventPos = -1; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->conn != NULL) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); - } - } -} - -/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ -static void -ShutdownConnection(Safekeeper *sk) -{ - if (sk->conn) - walprop_finish(sk->conn); - sk->conn = NULL; - sk->state = SS_OFFLINE; - sk->flushWrite = false; - sk->streamingAt = InvalidXLogRecPtr; - - if (sk->voteResponse.termHistory.entries) - pfree(sk->voteResponse.termHistory.entries); - sk->voteResponse.termHistory.entries = NULL; - - HackyRemoveWalProposerEvent(sk); -} - -/* - * This function is called to establish new connection or to reestablish - * connection in case of connection failure. - * - * On success, sets the state to SS_CONNECTING_WRITE. - */ -static void -ResetConnection(Safekeeper *sk) -{ - pgsocket sock; /* socket of the new connection */ - - if (sk->state != SS_OFFLINE) - { - ShutdownConnection(sk); - } - - /* - * Try to establish new connection - * - * If the connection information hasn't been filled out, we need to do - * that here. - */ - if (sk->conninfo[0] == '\0') - { - int written = 0; - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); - // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, - // so it is better to be defensive and check that everything aligns well - if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); - } - - sk->conn = walprop_connect_start((char *) &sk->conninfo); - - /* - * "If the result is null, then libpq has been unable to allocate a new - * PGconn structure" - */ - if (!sk->conn) - elog(FATAL, "failed to allocate new PGconn object"); - - /* - * PQconnectStart won't actually start connecting until we run - * PQconnectPoll. Before we do that though, we need to check that it - * didn't immediately fail. - */ - if (walprop_status(sk->conn) == WP_CONNECTION_BAD) - { - /*--- - * According to libpq docs: - * "If the result is CONNECTION_BAD, the connection attempt has already failed, - * typically because of invalid connection parameters." - * We should report this failure. - * - * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS - */ - elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", - sk->conninfo, walprop_error_message(sk->conn)); - - /* - * Even though the connection failed, we still need to clean up the - * object - */ - walprop_finish(sk->conn); - sk->conn = NULL; - return; - } - - /* - * The documentation for PQconnectStart states that we should call - * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or - * PGRES_POLLING_FAILED. The other two possible returns indicate whether - * we should wait for reading or writing on the socket. 
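
A minimal sketch of the defensive conninfo construction in ResetConnection() above. snprintf() returns the length it would have needed (excluding the terminating NUL), so this sketch treats a return value equal to or larger than the buffer size as truncation; buffer size and helper name are placeholders.

#include <stdio.h>

#define DEMO_MAXCONNINFO 1024   /* stand-in for libpq's MAXCONNINFO */

/* Returns the conninfo length, or -1 if it did not fit in the buffer. */
static int
build_conninfo(char *buf, const char *host, const char *port,
               const char *timeline, const char *tenant)
{
    int written = snprintf(buf, DEMO_MAXCONNINFO,
                           "host=%s port=%s dbname=replication "
                           "options='-c ztimelineid=%s ztenantid=%s'",
                           host, port, timeline, tenant);

    return (written < 0 || written >= DEMO_MAXCONNINFO) ? -1 : written;
}
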
For the first - * iteration of the loop, we're expected to wait until the socket becomes - * writable. - * - * The wording of the documentation is a little ambiguous; thankfully - * there's an example in the postgres source itself showing this behavior. - * (see libpqrcv_connect, defined in - * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) - */ - elog(LOG, "connecting with node %s:%s", sk->host, sk->port); - - sk->state = SS_CONNECTING_WRITE; - sk->startedConnAt = GetCurrentTimestamp(); - - sock = walprop_socket(sk->conn); - sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); - return; -} - -/* - * How much milliseconds left till we should attempt reconnection to - * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect - * (do we actually need this?). - */ -static long -TimeToReconnect(TimestampTz now) -{ - TimestampTz passed; - TimestampTz till_reconnect; - - if (wal_acceptor_reconnect_timeout <= 0) - return -1; - - passed = now - last_reconnect_attempt; - till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; - if (till_reconnect <= 0) - return 0; - return (long) (till_reconnect / 1000); -} - -/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ -static void -ReconnectSafekeepers(void) -{ - TimestampTz now = GetCurrentTimestamp(); - - if (TimeToReconnect(now) == 0) - { - last_reconnect_attempt = now; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_OFFLINE) - ResetConnection(&safekeeper[i]); - } - } -} - -/* - * Performs the logic for advancing the state machine of the specified safekeeper, - * given that a certain set of events has occured. - */ -static void -AdvancePollState(Safekeeper *sk, uint32 events) -{ - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. - */ - AssertEventsOkForState(events, sk); - - /* Execute the code corresponding to the current state */ - switch (sk->state) - { - /* - * safekeepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ - - /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - HandleConnectionEvent(sk); - break; - - /* - * Waiting for a successful CopyBoth response. - */ - case SS_WAIT_EXEC_RESULT: - RecvStartWALPushResult(sk); - break; - - /* - * Finish handshake comms: receive information about the safekeeper. - */ - case SS_HANDSHAKE_RECV: - RecvAcceptorGreeting(sk); - break; - - /* - * Voting is an idle state - we don't expect any events to trigger. - * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are - * transferred from SS_VOTING to sending actual vote requests. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return; - - /* Read the safekeeper response for our candidate */ - case SS_WAIT_VERDICT: - RecvVoteResponse(sk); - break; - - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: - - /* - * AsyncFlush ensures we only move on to SS_ACTIVE once the flush - * completes. If we still have more to do, we'll wait until the next - * poll comes along. 
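
A standalone sketch of the TimeToReconnect() arithmetic above: timestamps are in microseconds while the reconnect timeout GUC is in milliseconds, hence the conversions by 1000. Names are illustrative, not the patch's own.

#include <stdint.h>

/* Milliseconds left until the next reconnect attempt; 0 = now, -1 = never. */
static long
ms_until_reconnect(int64_t now_us, int64_t last_attempt_us, int timeout_ms)
{
    int64_t passed_us;
    int64_t remaining_us;

    if (timeout_ms <= 0)
        return -1;              /* automatic reconnection disabled */

    passed_us = now_us - last_attempt_us;
    remaining_us = (int64_t) timeout_ms * 1000 - passed_us;

    return remaining_us <= 0 ? 0 : (long) (remaining_us / 1000);
}
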
- */ - if (!AsyncFlush(sk)) - return; - - /* flush is done, event set and state will be updated later */ - StartStreaming(sk); - break; - - /* - * Idle state for waiting votes from quorum. - */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return; - - /* - * Active state is used for streaming WAL and receiving feedback. - */ - case SS_ACTIVE: - HandleActiveState(sk, events); - break; - } -} - -static void -HandleConnectionEvent(Safekeeper *sk) -{ - WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", sk->host, - sk->port); - - /* - * We have to pick some event to update event set. - * We'll eventually need the socket to be readable, - * so we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; - - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - sk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - sk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; - - case WP_CONN_POLLING_FAILED: - elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectSafekeepers. - */ - ShutdownConnection(sk); - return; - } - - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. - */ - HackyRemoveWalProposerEvent(sk); - sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); - - /* If we successfully connected, send START_WAL_PUSH query */ - if (result == WP_CONN_POLLING_OK) - SendStartWALPush(sk); -} - -/* - * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs - * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something - * goes wrong, change state to SS_OFFLINE and shutdown the connection. - */ -static void -SendStartWALPush(Safekeeper *sk) -{ - if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) - { - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return; - } - sk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(sk, WL_SOCKET_READABLE); -} - -static void -RecvStartWALPushResult(Safekeeper *sk) -{ - switch (walprop_get_query_result(sk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: - - SendProposerGreeting(sk); - break; - - /* - * Needs repeated calls to finish. 
Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: - - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; - - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return; - - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); - ShutdownConnection(sk); - return; - } -} - -/* - * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ -static void -SendProposerGreeting(Safekeeper *sk) -{ - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. - */ - BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); -} - -static void -RecvAcceptorGreeting(Safekeeper *sk) -{ - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) - return; - - /* Protocol is all good, move to voting. */ - sk->state = SS_VOTING; - - ++n_connected; - if (n_connected <= quorum) - { - /* We're still collecting terms from the majority. */ - propTerm = Max(sk->greetResponse.term, propTerm); - - /* Quorum is acquried, prepare the vote request. */ - if (n_connected == quorum) - { - propTerm++; - elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); - - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); - } - } - else if (sk->greetResponse.term > propTerm) - { - /* Another compute with higher term is running. */ - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, propTerm); - } - - /* - * Check if we have quorum. If there aren't enough safekeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election. - */ - if (n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. - */ - UpdateEventSet(sk, WL_SOCKET_READABLE); - } - else - { - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_safekeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything - * yet. 
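
A sketch of the term selection performed above: the proposer takes the highest term reported by the first majority of safekeepers that connect and adds one, so its term is strictly higher than anything a quorum has acknowledged. Illustrative code, not the patch's.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_term;

static demo_term
choose_proposer_term(const demo_term *reported, int quorum)
{
    demo_term max_seen = 0;

    for (int i = 0; i < quorum; i++)
        if (reported[i] > max_seen)
            max_seen = reported[i];

    return max_seen + 1;
}

int
main(void)
{
    demo_term greetings[] = {4, 7, 5};  /* terms from the first quorum of nodes */

    printf("propTerm = %llu\n",
           (unsigned long long) choose_proposer_term(greetings, 3));  /* prints 8 */
    return 0;
}
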
- */ - if (safekeeper[j].state == SS_VOTING) - SendVoteRequest(&safekeeper[j]); - } - } -} - -static void -SendVoteRequest(Safekeeper *sk) -{ - /* We have quorum for voting, send our vote request */ - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; - - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ -} - -static void -RecvVoteResponse(Safekeeper *sk) -{ - sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) - return; - - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); - - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!sk->voteResponse.voteGiven) && - (sk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, propTerm); - } - Assert(sk->voteResponse.term == propTerm); - - /* Handshake completed, do we have quorum? */ - n_votes++; - if (n_votes < quorum) - { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { - /* recovery already performed, just start streaming */ - SendProposerElected(sk); - } - else - { - sk->state = SS_IDLE; - UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ - - HandleElectedProposer(); - } -} - -/* - * Called once a majority of acceptors have voted for us and current proposer - * has been elected. - * - * Sends ProposerElected message to all acceptors in SS_IDLE state and starts - * replication from walsender. - */ -static void -HandleElectedProposer(void) -{ - DetermineEpochStartLsn(); - - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_IDLE) - SendProposerElected(&safekeeper[i]); - } - - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. 
- */ - - if (syncSafekeepers) - { - /* - * Send empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastAppendRequest(); - - /* keep polling until all safekeepers are synced */ - return; - } - - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ -} - -/* latest term in TermHistory, or 0 is there is no entries */ -static term_t -GetHighestTerm(TermHistory *th) -{ - return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; -} - -/* safekeeper's epoch is the term of the highest entry in the log */ -static term_t -GetEpoch(Safekeeper *sk) -{ - return GetHighestTerm(&sk->voteResponse.termHistory); -} - -/* If LSN points to the page header, skip it */ -static XLogRecPtr -SkipXLogPageHeader(XLogRecPtr lsn) -{ - if (XLogSegmentOffset(lsn, wal_segment_size) == 0) - { - lsn += SizeOfXLogLongPHD; - } - else if (lsn % XLOG_BLCKSZ == 0) - { - lsn += SizeOfXLogShortPHD; - } - return lsn; -} - -/* - * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since - * which we'll write WAL in our term. - * - * Sets truncateLsn along the way (though it is not of much use at this point -- - * only for skipping recovery). - */ -static void -DetermineEpochStartLsn(void) -{ - TermHistory *dth; - - propEpochStartLsn = InvalidXLogRecPtr; - donorEpoch = 0; - truncateLsn = InvalidXLogRecPtr; - timelineStartLsn = InvalidXLogRecPtr; - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_IDLE) - { - if (GetEpoch(&safekeeper[i]) > donorEpoch || - (GetEpoch(&safekeeper[i]) == donorEpoch && - safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) - { - donorEpoch = GetEpoch(&safekeeper[i]); - propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; - donor = i; - } - truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); - - if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (timelineStartLsn != InvalidXLogRecPtr && - timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) - { - elog(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(timelineStartLsn), - LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); - } - timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; - } - } - } - - /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. Start streaming then from the basebackup LSN. - */ - if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) - { - propEpochStartLsn = truncateLsn = GetRedoStartLsn(); - if (timelineStartLsn == InvalidXLogRecPtr) - { - timelineStartLsn = GetRedoStartLsn(); - } - elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); - } - - /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to - * some connected safekeeper; it must have carried truncateLsn pointing to - * the first record. - */ - Assert((truncateLsn != InvalidXLogRecPtr) || - (syncSafekeepers && truncateLsn == propEpochStartLsn)); - - /* - * We will be generating WAL since propEpochStartLsn, so we should set - * availableLsn to mark this LSN as the latest available position. 
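
A standalone sketch of the donor choice in DetermineEpochStartLsn() above: the most advanced safekeeper is the one with the highest (epoch, flushLsn) pair, compared lexicographically. Types and values here are placeholders.

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint64_t epoch;             /* term of the highest entry in the node's log */
    uint64_t flush_lsn;         /* how far the node has persisted WAL */
} demo_vote;

static int
choose_donor(const demo_vote *votes, int n)
{
    int donor = 0;

    for (int i = 1; i < n; i++)
    {
        if (votes[i].epoch > votes[donor].epoch ||
            (votes[i].epoch == votes[donor].epoch &&
             votes[i].flush_lsn > votes[donor].flush_lsn))
            donor = i;
    }
    return donor;
}

int
main(void)
{
    demo_vote votes[] = {{3, 0x1000}, {4, 0x0800}, {4, 0x0900}};

    printf("donor = %d\n", choose_donor(votes, 3));     /* prints 2 */
    return 0;
}
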
- */ - availableLsn = propEpochStartLsn; - - /* - * Proposer's term history is the donor's + its own entry. - */ - dth = &safekeeper[donor].voteResponse.termHistory; - propTermHistory.n_entries = dth->n_entries + 1; - propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); - memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); - propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; - propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; - - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - quorum, - propTerm, - LSN_FORMAT_ARGS(propEpochStartLsn), - safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); - - /* - * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since - * which we are going to write according to the consensus. If not, we must - * bail out, as clog and other non rel data is inconsistent. - */ - if (!syncSafekeepers) - { - /* - * Basebackup LSN always points to the beginning of the record (not the - * page), as StartupXLOG most probably wants it this way. Safekeepers - * don't skip header as they need continious stream of data, so - * correct LSN for comparison. - */ - if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) - { - /* - * However, allow to proceed if previously elected leader was me; plain - * restart of walproposer not intervened by concurrent compute (who could - * generate WAL) is ok. - */ - if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) - { - elog(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(propEpochStartLsn), - LSN_FORMAT_ARGS(GetRedoStartLsn())); - } - } - walprop_shared->mineLastElectedTerm = propTerm; - } -} - -/* - * Receive WAL from most advanced safekeeper - */ -static bool -WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) -{ - char conninfo[MAXCONNINFO]; - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - safekeeper[donor].host, safekeeper[donor].port, - err))); - return false; - } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - 
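			/*
			 * Note on the layout parsed below: a 'w' CopyData message in the
			 * physical replication protocol carries one message-type byte
			 * followed by three 8-byte big-endian integers (data start LSN,
			 * current WAL end, send timestamp) and then the WAL bytes
			 * themselves. That is why rec_start_lsn is read at offset
			 * XLOG_HDR_START_POS and converted with pg_ntoh64, and why the
			 * payload length is len - XLOG_HDR_SIZE.
			 */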
memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - - return true; -} - -/* - * Determine for sk the starting streaming point and send it message - * 1) Announcing we are elected proposer (which immediately advances epoch if - * safekeeper is synced, being important for sync-safekeepers) - * 2) Communicating starting streaming point -- safekeeper must truncate its WAL - * beyond it -- and history of term switching. - * - * Sets sk->startStreamingAt. - */ -static void -SendProposerElected(Safekeeper *sk) -{ - ProposerElected msg; - TermHistory *th; - term_t lastCommonTerm; - int i; - - /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. - * - * Note: there is a vanishingly small chance of no common point even if - * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. - */ - th = &sk->voteResponse.termHistory; - /* - * If any WAL is present on the sk, it must be authorized by some term. - * OTOH, without any WAL there are no term swiches in the log. - */ - Assert((th->n_entries == 0) == - (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); - /* We must start somewhere. */ - Assert(propTermHistory.n_entries >= 1); - - for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) - { - if (propTermHistory.entries[i].term != th->entries[i].term) - break; - /* term must begin everywhere at the same point */ - Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); - } - i--; /* step back to the last common term */ - if (i < 0) - { - /* safekeeper is empty or no common point, start from the beginning */ - sk->startStreamingAt = propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty safekeeper - * with no history. In combination with a gap, that can happen when - * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with - * care. - */ - - /* - * truncateLsn will not change without ack from current safekeeper, - * and it's aligned to the WAL record, so we can safely start - * streaming from this point. 
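The loop above that walks both term histories in parallel is the heart of SendProposerElected: it finds the last index at which proposer and safekeeper agree, with -1 meaning there is no common point at all. A minimal standalone sketch of that search, with made-up histories:

/* Standalone sketch of the divergence-point search used above. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t term_t;

static int
last_common_index(const term_t *a, int na, const term_t *b, int nb)
{
	int	i;
	int	n = na < nb ? na : nb;

	for (i = 0; i < n; i++)
	{
		if (a[i] != b[i])
			break;
	}
	return i - 1;				/* -1 means no common point at all */
}

int
main(void)
{
	term_t	proposer[] = {1, 2, 4};		/* proposer's term history */
	term_t	safekeeper[] = {1, 2, 3};	/* safekeeper's term history */

	printf("last common entry index: %d\n",
		   last_common_index(proposer, 3, safekeeper, 3));	/* prints 1 */
	return 0;
}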
- */ - sk->startStreamingAt = truncateLsn; - - elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } - } - else - { - /* - * End of (common) term is the start of the next except it is the last - * one; there it is flush_lsn in case of safekeeper or, in case of - * proposer, LSN it is currently writing, but then we just pick - * safekeeper pos as it obviously can't be higher. - */ - if (propTermHistory.entries[i].term == propTerm) - { - sk->startStreamingAt = sk->voteResponse.flushLsn; - } - else - { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); - sk->startStreamingAt = Min(propEndLsn, skEndLsn); - } - } - - Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); - - msg.tag = 'e'; - msg.term = propTerm; - msg.startStreamingAt = sk->startStreamingAt; - msg.termHistory = &propTermHistory; - msg.timelineStartLsn = timelineStartLsn; - - lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; - elog(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); - - if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) - return; - - StartStreaming(sk); -} - -/* - * Start walsender streaming replication - */ -static void -WalProposerStartStreaming(XLogRecPtr startpos) -{ - StartReplicationCmd cmd; - - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = greetRequest.timeline; - cmd.startpoint = startpos; - StartProposerReplication(&cmd); -} - -/* - * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets - * correct event set. - */ -static void -StartStreaming(Safekeeper *sk) -{ - /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. - */ - sk->state = SS_ACTIVE; - sk->streamingAt = sk->startStreamingAt; - - /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(sk); -} - -/* - * Try to send message to the particular node. Always updates event set. Will - * send at least one message, if socket is ready. - * - * Can be used only for safekeepers in SS_ACTIVE state. State can be changed - * in case of errors. 
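The pq_sendint64_le/pq_sendint32_le calls above lay the ProposerElected message out as fixed-width little-endian fields. The sketch below reproduces that layout with plain memcpy appends; like the real helpers, it assumes a little-endian host, and the concrete term-history values are made up for illustration.

/* Standalone sketch of the ProposerElected wire layout built above. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static size_t
append_u64_le(uint8_t *buf, size_t off, uint64_t v)
{
	memcpy(buf + off, &v, sizeof(v));	/* assumes a little-endian host */
	return off + sizeof(v);
}

static size_t
append_u32_le(uint8_t *buf, size_t off, uint32_t v)
{
	memcpy(buf + off, &v, sizeof(v));
	return off + sizeof(v);
}

int
main(void)
{
	uint8_t		buf[1024];
	size_t		off = 0;
	uint64_t	term_history[][2] = {{1, 0x100}, {2, 0x200}};	/* (term, lsn) pairs */

	off = append_u64_le(buf, off, 'e');		/* message tag */
	off = append_u64_le(buf, off, 2);		/* proposer term */
	off = append_u64_le(buf, off, 0x200);	/* startStreamingAt */
	off = append_u32_le(buf, off, 2);		/* number of term history entries */
	for (int i = 0; i < 2; i++)
	{
		off = append_u64_le(buf, off, term_history[i][0]);
		off = append_u64_le(buf, off, term_history[i][1]);
	}
	off = append_u64_le(buf, off, 0x100);	/* timelineStartLsn */

	printf("ProposerElected message is %zu bytes\n", off);
	return 0;
}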
- */ -static void -SendMessageToNode(Safekeeper *sk) -{ - Assert(sk->state == SS_ACTIVE); - - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - HandleActiveState(sk, WL_SOCKET_WRITEABLE); -} - -/* - * Broadcast new message to all caught-up safekeepers - */ -static void -BroadcastAppendRequest() -{ - for (int i = 0; i < n_safekeepers; i++) - if (safekeeper[i].state == SS_ACTIVE) - SendMessageToNode(&safekeeper[i]); -} - -static void -PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) -{ - Assert(endLsn >= beginLsn); - req->tag = 'a'; - req->term = propTerm; - req->epochStartLsn = propEpochStartLsn; - req->beginLsn = beginLsn; - req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; - req->proposerId = greetRequest.proposerId; -} - -/* - * Process all events happened in SS_ACTIVE state, update event set after that. - */ -static void -HandleActiveState(Safekeeper *sk, uint32 events) -{ - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(sk)) - return; - - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(sk)) - return; - - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. This check isn't - * necessary now, because we always send append messages immediately after - * arrival. But it's good to have it here in case we change this behavior - * in the future. - */ - if (sk->streamingAt != availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; - - UpdateEventSet(sk, newEvents); -} - -/* - * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. - */ -static bool -SendAppendRequests(Safekeeper *sk) -{ - XLogRecPtr endLsn; - AppendRequestHeader *req; - PGAsyncWriteResult writeResult; - WALReadError errinfo; - bool sentAnything = false; - - if (sk->flushWrite) - { - if (!AsyncFlush(sk)) - /* - * AsyncFlush failed, that could happen if the socket is closed or - * we have nothing to write and should wait for writeable socket. 
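The event-mask rule described above (always readable, writeable only while there is unsent WAL or an unflushed buffer) can be written as a small pure function. A standalone sketch, with simple bit flags standing in for WL_SOCKET_READABLE and WL_SOCKET_WRITEABLE:

/* Standalone sketch of the event-mask rule in HandleActiveState. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EV_READABLE  (1 << 0)	/* stand-in for WL_SOCKET_READABLE */
#define EV_WRITEABLE (1 << 1)	/* stand-in for WL_SOCKET_WRITEABLE */

static uint32_t
active_events(uint64_t streamingAt, uint64_t availableLsn, bool flushWrite)
{
	uint32_t	events = EV_READABLE;	/* always watch for feedback */

	/* wait for write-readiness only if there is something left to push */
	if (streamingAt != availableLsn || flushWrite)
		events |= EV_WRITEABLE;
	return events;
}

int
main(void)
{
	printf("caught up, nothing buffered: %u\n", (unsigned) active_events(0x500, 0x500, false));	/* 1 */
	printf("WAL pending:                 %u\n", (unsigned) active_events(0x400, 0x500, false));	/* 3 */
	printf("flush pending:               %u\n", (unsigned) active_events(0x500, 0x500, true));	/* 3 */
	return 0;
}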
- */ - return sk->state == SS_ACTIVE; - - /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; - } - - while (sk->streamingAt != availableLsn || !sentAnything) - { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) { - endLsn = availableLsn; - } - - req = &sk->appendRequest; - PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); - - ereport(DEBUG2, - (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - ThisTimeLineID, - &errinfo)) - { - WALReadRaiseError(&errinfo); - } - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) - { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; - - case PG_ASYNC_WRITE_TRY_FLUSH: - /* - * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event set. - */ - sk->flushWrite = true; - return true; - - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; - } - } - - return true; -} - -/* - * Receive and process all available feedback. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. - * - * NB: This function can call SendMessageToNode and produce new messages. - */ -static bool -RecvAppendResponses(Safekeeper *sk) -{ - XLogRecPtr minQuorumLsn; - bool readAnything = false; - - while (true) - { - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. - */ - sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) - break; - - ereport(DEBUG2, - (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port))); - - if (sk->appendResponse.term > propTerm) - { - /* Another compute with higher term is running. */ - elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, propTerm); - } - - readAnything = true; - } - - if (!readAnything) - return sk->state == SS_ACTIVE; - - HandleSafekeeperResponse(); - - /* - * Also send the new commit lsn to all the safekeepers. 
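The send loop above slices outstanding WAL into chunks of at most MAX_SEND_SIZE bytes and never reads past availableLsn. A standalone sketch of that chunking follows; the 8 kB block size behind MAX_SEND_SIZE is an assumption of the sketch, the patch itself only defines it as XLOG_BLCKSZ * 16.

/* Standalone sketch of the MAX_SEND_SIZE chunking loop in SendAppendRequests. */
#include <stdint.h>
#include <stdio.h>

#define MAX_SEND_SIZE (8192 * 16)	/* XLOG_BLCKSZ * 16, assuming 8 kB blocks */

int
main(void)
{
	uint64_t	streamingAt = 0;
	uint64_t	availableLsn = 300000;	/* pretend this much WAL is available */

	while (streamingAt != availableLsn)
	{
		uint64_t	endLsn = streamingAt + MAX_SEND_SIZE;

		if (endLsn > availableLsn)
			endLsn = availableLsn;	/* never send past what we generated */

		printf("AppendRequest [%lu, %lu), %lu bytes\n",
			   (unsigned long) streamingAt, (unsigned long) endLsn,
			   (unsigned long) (endLsn - streamingAt));
		streamingAt = endLsn;
	}
	return 0;
}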
- */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastAppendRequest(); - lastSentCommitLsn = minQuorumLsn; - } - - return sk->state == SS_ACTIVE; -} - -/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ -void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) -{ - uint8 nkeys; - int i; - int32 len; - - /* get number of custom keys */ - nkeys = pq_getmsgbyte(reply_message); - - for (i = 0; i < nkeys; i++) - { - const char *key = pq_getmsgstring(reply_message); - if (strcmp(key, "current_timeline_size") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); - } - else if (strcmp(key, "ps_writelsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); - } - else if (strcmp(key, "ps_flushlsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); - } - else if (strcmp(key, "ps_applylsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); - } - else if (strcmp(key, "ps_replytime") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); - - pfree(replyTimeStr); - } - } - else - { - len = pq_getmsgint(reply_message, sizeof(int32)); // read value length - // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); - }; - } -} - -/* - * Combine hot standby feedbacks from all safekeepers. - */ -static void -CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) -{ - hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.hs.ts != 0) - { - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) - { - hs->xmin = safekeeper[i].appendResponse.hs.xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; - } - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) - { - hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; - } - } - } -} - - -/* - * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the - * last WAL record that can be safely discarded. - */ -static XLogRecPtr -CalculateMinFlushLsn(void) -{ - XLogRecPtr lsn = n_safekeepers > 0 - ? 
safekeeper[0].appendResponse.flushLsn - : InvalidXLogRecPtr; - for (int i = 1; i < n_safekeepers; i++) - { - lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); - } - return lsn; -} - -/* - * Calculate WAL position acknowledged by quorum - */ -static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(void) -{ - XLogRecPtr responses[MAX_SAFEKEEPERS]; - - /* - * Sort acknowledged LSNs - */ - for (int i = 0; i < n_safekeepers; i++) - { - /* - * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. - */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? - safekeeper[i].appendResponse.flushLsn : 0; - } - qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); - - /* - * Get the smallest LSN committed by quorum - */ - return responses[n_safekeepers - quorum]; -} - -/* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed - */ -Size -WalproposerShmemSize(void) -{ - return sizeof(WalproposerShmemState); -} - -bool -WalproposerShmemInit(void) -{ - bool found; - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); - - if (!found) - { - memset(walprop_shared, 0, WalproposerShmemSize()); - SpinLockInit(&walprop_shared->mutex); - } - LWLockRelease(AddinShmemInitLock); - - return found; -} - -void -replication_feedback_set(ReplicationFeedback *rf) -{ - SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); - SpinLockRelease(&walprop_shared->mutex); -} - - -void -replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) -{ - SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - *applyLsn = walprop_shared->feedback.ps_applylsn; - SpinLockRelease(&walprop_shared->mutex); -} - - -/* - * Get ReplicationFeedback fields from the most advanced safekeeper - */ -static void -GetLatestZentihFeedback(ReplicationFeedback *rf) -{ - int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) - { - latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; - } - } - - rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; - - elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); - - replication_feedback_set(rf); -} - -static void -HandleSafekeeperResponse(void) -{ - HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; - XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; - - - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; - - if (!syncSafekeepers) - { - // Get ReplicationFeedback fields 
from the most advanced safekeeper - GetLatestZentihFeedback(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) - { - - if (minQuorumLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = minQuorumLsn; - - /* advance the replication slot */ - if (!syncSafekeepers) - ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. - quorumFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. - quorumFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. - quorumFeedback.rf.ps_flushlsn, - GetCurrentTimestamp(), false); - } - - CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) - { - quorumFeedback.hs = hsFeedback; - if (!syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); - } - - /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. - * - * Advanced truncateLsn should be not further than nearest commitLsn. - * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. - */ - minFlushLsn = CalculateMinFlushLsn(); - if (minFlushLsn > truncateLsn) - { - truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note - * that slot doesn't exist if we are in syncSafekeepers mode. - */ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(truncateLsn); - } - - /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and - * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-safekeeper) we currently - * wait for all seemingly alive safekeepers to get synced. - */ - if (syncSafekeepers) - { - int n_synced; - - n_synced = 0; - for (int i = 0; i < n_safekeepers; i++) - { - Safekeeper *sk = &safekeeper[i]; - bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; - - /* alive safekeeper which is not synced yet; wait for it */ - if (sk->state != SS_OFFLINE && !synced) - return; - if (synced) - n_synced++; - } - if (n_synced >= quorum) - { - /* All safekeepers synced! 
*/ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - } -} - -/* - * Try to read CopyData message from i'th safekeeper, resetting connection on - * failure. - */ -static bool -AsyncRead(Safekeeper *sk, char **buf, int *buf_size) -{ - switch (walprop_async_read(sk->conn, buf, buf_size)) - { - case PG_ASYNC_READ_SUCCESS: - return true; - - case PG_ASYNC_READ_TRY_AGAIN: - /* WL_SOCKET_READABLE is always set during copyboth */ - return false; - - case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - } - Assert(false); - return false; -} - -/* - * Read next message with known type into provided struct, by reading a CopyData - * block from the safekeeper's postgres connection, returning whether the read - * was successful. - * - * If the read needs more polling, we return 'false' and keep the state - * unmodified, waiting until it becomes read-ready to try again. If it fully - * failed, a warning is emitted and the connection is reset. - */ -static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) -{ - char *buf; - int buf_size; - uint64 tag; - StringInfoData s; - - if (!(AsyncRead(sk, &buf, &buf_size))) - return false; - - /* parse it */ - s.data = buf; - s.len = buf_size; - s.cursor = 0; - - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) - { - elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return false; - } - - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) - { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); - } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } - } -} - -/* - * Blocking equivalent to AsyncWrite. - * - * We use this everywhere messages are small enough that they should fit in a - * single packet. 
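AsyncReadMessage above starts every parse the same way: read the 8-byte little-endian tag from the front of the CopyData payload and bail out if it is not the tag the caller expected. A standalone sketch of that check, assuming a little-endian host just like the pq_*_le helpers do:

/* Standalone sketch of the tag check performed by AsyncReadMessage. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* read a little-endian uint64 from the front of a raw CopyData buffer */
static uint64_t
read_u64_le(const uint8_t *buf)
{
	uint64_t	v;

	memcpy(&v, buf, sizeof(v));	/* assumes a little-endian host */
	return v;
}

int
main(void)
{
	uint8_t		copydata[64] = {0};
	uint64_t	tag = 'a';		/* AppendResponse */

	memcpy(copydata, &tag, sizeof(tag));	/* pretend this came off the socket */

	uint64_t	got = read_u64_le(copydata);
	uint64_t	expected = 'a';

	if (got != expected)
		printf("unexpected tag %c, resetting connection\n", (char) got);
	else
		printf("tag %c matches, parse the rest of the message\n", (char) got);
	return 0;
}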
- */ -static bool -BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) -{ - uint32 events; - - if (!walprop_blocking_write(sk->conn, msg, msg_size)) - { - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - } - - sk->state = success_state; - - /* - * If the new state will be waiting for events to happen, update the event - * set to wait for those - */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - UpdateEventSet(sk, events); - - return true; -} - -/* - * Starts a write into the 'i'th safekeeper's postgres connection, moving to - * flush_state (adjusting eventset) if write still needs flushing. - * - * Returns false if sending is unfinished (requires flushing or conn failed). - * Upon failure, a warning is emitted and the connection is reset. - */ -static bool -AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) -{ - switch (walprop_async_write(sk->conn, msg, msg_size)) - { - case PG_ASYNC_WRITE_SUCCESS: - return true; - case PG_ASYNC_WRITE_TRY_FLUSH: - - /* - * We still need to call PQflush some more to finish the job; go - * to the appropriate state. Update the event set at the bottom of - * this function - */ - sk->state = flush_state; - UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); - return false; - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; - } -} - -/* - * Flushes a previous call to AsyncWrite. This only needs to be called when the - * socket becomes read or write ready *after* calling AsyncWrite. - * - * If flushing successfully completes returns true, otherwise false. Event set - * is updated only if connection fails, otherwise caller should manually unset - * WL_SOCKET_WRITEABLE. - */ -static bool -AsyncFlush(Safekeeper *sk) -{ - /*--- - * PQflush returns: - * 0 if successful [we're good to move on] - * 1 if unable to send everything yet [call PQflush again] - * -1 if it failed [emit an error] - */ - switch (walprop_flush(sk->conn)) - { - case 0: - /* flush is done */ - return true; - case 1: - /* Nothing to do; try again when the socket's ready */ - return false; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ResetConnection(sk); - return false; - default: - Assert(false); - return false; - } -} - -// Check if we need to suspend inserts because of lagging replication. 
-static uint64 -backpressure_lag_impl(void) -{ - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024*1024) - - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((writePtr != InvalidXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) - { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); - } - - if ((flushPtr != InvalidXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); - } - - if ((applyPtr != InvalidXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); - } - } - return 0; -} diff --git a/contrib/neon/walproposer.h b/contrib/neon/walproposer.h deleted file mode 100644 index b684d5264f7..00000000000 --- a/contrib/neon/walproposer.h +++ /dev/null @@ -1,540 +0,0 @@ -#ifndef __NEON_WALPROPOSER_H__ -#define __NEON_WALPROPOSER_H__ - -#include "access/xlogdefs.h" -#include "postgres.h" -#include "port.h" -#include "access/xlog_internal.h" -#include "access/transam.h" -#include "nodes/replnodes.h" -#include "utils/uuid.h" -#include "replication/walreceiver.h" - -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - -#define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ - -/* - * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, - * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 - */ -#define WL_NO_EVENTS 0 - -extern char* wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; -extern bool am_wal_proposer; - -struct WalProposerConn; /* Defined in libpqwalproposer */ -typedef struct WalProposerConn WalProposerConn; - -struct WalMessage; -typedef struct WalMessage WalMessage; - -extern char *zenith_timeline_walproposer; -extern char *zenith_tenant_walproposer; - -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then try - * again. */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - /* The write started, but you'll need to call PQflush some more times - * to finish it off. We just tried, so it's best to wait until the - * connection is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. 
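Each branch of backpressure_lag_impl above applies the same arithmetic: if the local flush LSN is more than the configured number of megabytes ahead of what the remote side reported, return the excess in bytes so the caller can throttle inserts. A standalone sketch of one such branch, with made-up numbers:

/* Standalone sketch of one branch of the backpressure computation above. */
#include <stdint.h>
#include <stdio.h>

#define MB ((uint64_t) 1024 * 1024)

/* bytes by which the compute exceeds the allowed window, 0 if within it */
static uint64_t
lag_over_limit(uint64_t myFlushLsn, uint64_t remotePtr, uint64_t limit_mb)
{
	if (remotePtr != 0 && limit_mb > 0 && myFlushLsn > remotePtr + limit_mb * MB)
		return myFlushLsn - remotePtr - limit_mb * MB;
	return 0;
}

int
main(void)
{
	/* compute wrote 600 MB of WAL, pageserver applied 80 MB, limit is 500 MB */
	uint64_t	excess = lag_over_limit(600 * MB, 80 * MB, 500);

	printf("throttle inserts, %lu bytes over the apply-lag limit\n",
		   (unsigned long) excess);	/* 20 MB over */
	return 0;
}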
If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - -/* - * WAL safekeeper state, which is used to wait for some event. - * - * States are listed here in the order that they're executed. - * - * Most states, upon failure, will move back to SS_OFFLINE by calls to - * ResetConnection or ShutdownConnection. - */ -typedef enum -{ - /* - * Does not have an active connection and will stay that way until - * further notice. - * - * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. - */ - SS_OFFLINE, - - /* - * Connecting states. "_READ" waits for the socket to be available for - * reading, "_WRITE" waits for writing. There's no difference in the code - * they execute when polled, but we have this distinction in order to - * recreate the event set in HackyRemoveWalProposerEvent. - * - * After the connection is made, "START_WAL_PUSH" query is sent. - */ - SS_CONNECTING_WRITE, - SS_CONNECTING_READ, - - /* - * Waiting for the result of the "START_WAL_PUSH" command. - * - * After we get a successful result, sends handshake to safekeeper. - */ - SS_WAIT_EXEC_RESULT, - - /* - * Executing the receiving half of the handshake. After receiving, moves to - * SS_VOTING. - */ - SS_HANDSHAKE_RECV, - - /* - * Waiting to participate in voting, but a quorum hasn't yet been reached. - * This is an idle state - we do not expect AdvancePollState to be called. - * - * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a - * quorum of handshakes. - */ - SS_VOTING, - - /* - * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. - */ - SS_WAIT_VERDICT, - - /* Need to flush ProposerElected message. */ - SS_SEND_ELECTED_FLUSH, - - /* - * Waiting for quorum to send WAL. Idle state. If the socket becomes - * read-ready, the connection has been closed. - * - * Moves to SS_ACTIVE only by call to StartStreaming. - */ - SS_IDLE, - - /* - * Active phase, when we acquired quorum and have WAL to send or feedback - * to read. - */ - SS_ACTIVE, -} SafekeeperState; - -/* Consensus logical timestamp. */ -typedef uint64 term_t; - -/* neon storage node id */ -typedef uint64 NNodeId; - -/* - * Proposer <-> Acceptor messaging. - */ - -/* Initial Proposer -> Acceptor message */ -typedef struct ProposerGreeting -{ - uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-safekeeper protocol version */ - uint32 pgVersion; - pg_uuid_t proposerId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; - TimeLineID timeline; - uint32 walSegSize; -} ProposerGreeting; - -typedef struct AcceptorProposerMessage -{ - uint64 tag; -} AcceptorProposerMessage; - -/* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. - */ -typedef struct AcceptorGreeting -{ - AcceptorProposerMessage apm; - term_t term; - NNodeId nodeId; -} AcceptorGreeting; - -/* - * Proposer -> Acceptor vote request. - */ -typedef struct VoteRequest -{ - uint64 tag; - term_t term; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; - -/* Element of term switching chain. 
*/ -typedef struct TermSwitchEntry -{ - term_t term; - XLogRecPtr lsn; -} TermSwitchEntry; - -typedef struct TermHistory -{ - uint32 n_entries; - TermSwitchEntry *entries; -} TermHistory; - -/* Vote itself, sent from safekeeper to proposer */ -typedef struct VoteResponse { - AcceptorProposerMessage apm; - term_t term; - uint64 voteGiven; - /* - * Safekeeper flush_lsn (end of WAL) + history of term switches allow - * proposer to choose the most advanced one. - */ - XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ - TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; - -/* - * Proposer -> Acceptor message announcing proposer is elected and communicating - * epoch history to it. - */ -typedef struct ProposerElected -{ - uint64 tag; - term_t term; - /* proposer will send since this point */ - XLogRecPtr startStreamingAt; - /* history of term switches up to this proposer */ - TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; -} ProposerElected; - -/* - * Header of request with WAL message sent from proposer to safekeeper. - */ -typedef struct AppendRequestHeader -{ - uint64 tag; - term_t term; /* term of the proposer */ - /* - * LSN since which current proposer appends WAL (begin_lsn of its first - * record); determines epoch switch point. - */ - XLogRecPtr epochStartLsn; - XLogRecPtr beginLsn; /* start position of message in WAL */ - XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ - /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) - */ - XLogRecPtr truncateLsn; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; - -/* - * Hot standby feedback received from replica - */ -typedef struct HotStandbyFeedback -{ - TimestampTz ts; - FullTransactionId xmin; - FullTransactionId catalog_xmin; -} HotStandbyFeedback; - - -typedef struct ReplicationFeedback -{ - // current size of the timeline on pageserver - uint64 currentClusterSize; - // standby_status_update fields that safekeeper received from pageserver - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; - - -typedef struct WalproposerShmemState -{ - slock_t mutex; - ReplicationFeedback feedback; - term_t mineLastElectedTerm; -} WalproposerShmemState; - -/* - * Report safekeeper state to proposer - */ -typedef struct AppendResponse -{ - AcceptorProposerMessage apm; - /* - * Current term of the safekeeper; if it is higher than proposer's, the - * compute is out of date. - */ - term_t term; - // TODO: add comment - XLogRecPtr flushLsn; - // Safekeeper reports back his awareness about which WAL is committed, as - // this is a criterion for walproposer --sync mode exit - XLogRecPtr commitLsn; - HotStandbyFeedback hs; - // Feedback recieved from pageserver includes standby_status_update fields - // and custom zenith feedback. - // This part of the message is extensible. 
- ReplicationFeedback rf; -} AppendResponse; - -// ReplicationFeedback is extensible part of the message that is parsed separately -// Other fields are fixed part -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) - - -/* - * Descriptor of safekeeper - */ -typedef struct Safekeeper -{ - char const* host; - char const* port; - char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ - - /* - * postgres protocol connection to the WAL acceptor - * - * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we - * reach SS_ACTIVE; not before. - */ - WalProposerConn* conn; - /* - * Temporary buffer for the message being sent to the safekeeper. - */ - StringInfoData outbuf; - /* - * WAL reader, allocated for each safekeeper. - */ - XLogReaderState* xlogreader; - - /* - * Streaming will start here; must be record boundary. - */ - XLogRecPtr startStreamingAt; - - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - XLogRecPtr streamingAt; /* current streaming position */ - AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - - int eventPos; /* position in wait event set. Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback for master */ -} Safekeeper; - - -extern PGDLLIMPORT void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -void WalProposerPoll(void); -void WalProposerRegister(void); -void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); -extern void StartProposerReplication(StartReplicationCmd *cmd); - -Size WalproposerShmemSize(void); -bool WalproposerShmemInit(void); -void replication_feedback_set(ReplicationFeedback *rf); -void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); - -/* libpqwalproposer hooks & helper type */ - -/* Re-exported PostgresPollingStatusType */ -typedef enum -{ - WP_CONN_POLLING_FAILED = 0, - WP_CONN_POLLING_READING, - WP_CONN_POLLING_WRITING, - WP_CONN_POLLING_OK, - /* - * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. - * We've removed it here to avoid clutter. - */ -} WalProposerConnectPollStatusType; - -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - /* Any success result other than a single CopyBoth was received. The specifics of the result - * were already logged, but it may be useful to provide an error message indicating which - * safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. */ - WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, then call again. Internally, this is - * returned when PQisBusy indicates that PQgetResult would block. */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - -/* Re-exported ConnStatusType */ -typedef enum -{ - WP_CONNECTION_OK, - WP_CONNECTION_BAD, - - /* - * The original ConnStatusType has many more tags, but requests that - * they not be relied upon (except for displaying to the user). We - * don't need that extra functionality, so we collect them into a - * single tag here. 
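APPENDRESPONSE_FIXEDPART_SIZE above relies on offsetof to split the message into a fixed prefix and an optional, extensible ReplicationFeedback tail; AsyncReadMessage earlier in this patch parses the tail only when the received buffer is longer than the fixed part. A standalone sketch of that pattern, with stand-in struct names used purely for illustration:

/* Standalone sketch of the "fixed part + optional extensible tail" layout. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
	uint64_t	term;
	uint64_t	flushLsn;
	uint64_t	commitLsn;
} FixedPart;				/* stand-in for the fixed fields of AppendResponse */

typedef struct
{
	FixedPart	fixed;
	char		feedback[64];	/* stand-in for the extensible feedback tail */
} Response;

#define RESPONSE_FIXEDPART_SIZE offsetof(Response, feedback)

int
main(void)
{
	size_t		buf_size_without_tail = RESPONSE_FIXEDPART_SIZE;
	size_t		buf_size_with_tail = RESPONSE_FIXEDPART_SIZE + 17;

	printf("fixed part is %zu bytes\n", (size_t) RESPONSE_FIXEDPART_SIZE);
	printf("message of %zu bytes carries feedback: %s\n",
		   buf_size_without_tail,
		   buf_size_without_tail > RESPONSE_FIXEDPART_SIZE ? "yes" : "no");
	printf("message of %zu bytes carries feedback: %s\n",
		   buf_size_with_tail,
		   buf_size_with_tail > RESPONSE_FIXEDPART_SIZE ? "yes" : "no");
	return 0;
}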
- */ - WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; - -/* Re-exported PQerrorMessage */ -typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); - -/* Re-exported PQstatus */ -typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); - -/* Re-exported PQconnectStart */ -typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); - -/* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); - -/* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); - -/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); - -/* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); - -/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); - -/* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn* conn); - -/* - * Ergonomic wrapper around PGgetCopyData - * - * Reads a CopyData block from a safekeeper, setting *amount to the number - * of bytes returned. - * - * This function is allowed to assume certain properties specific to the - * protocol with the safekeepers, so it should not be used as-is for any - * other purpose. - * - * Note: If possible, using is generally preferred, because it - * performs a bit of extra checking work that's always required and is normally - * somewhat verbose. - */ -typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, - char** buf, - int* amount); - -/* - * Ergonomic wrapper around PQputCopyData + PQflush - * - * Starts to write a CopyData block to a safekeeper. - * - * For information on the meaning of return codes, refer to PGAsyncWriteResult. - */ -typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, - void const* buf, - size_t size); - -/* - * Blocking equivalent to walprop_async_write_fn - * - * Returns 'true' if successful, 'false' on failure. - */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); - -/* All libpqwalproposer exported functions collected together. 
*/ -typedef struct WalProposerFunctionsType -{ - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; - -/* Allow the above functions to be "called" with normal syntax */ -#define walprop_error_message(conn) \ - WalProposerFunctions->walprop_error_message(conn) -#define walprop_status(conn) \ - WalProposerFunctions->walprop_status(conn) -#define walprop_connect_start(conninfo) \ - WalProposerFunctions->walprop_connect_start(conninfo) -#define walprop_connect_poll(conn) \ - WalProposerFunctions->walprop_connect_poll(conn) -#define walprop_send_query(conn, query) \ - WalProposerFunctions->walprop_send_query(conn, query) -#define walprop_get_query_result(conn) \ - WalProposerFunctions->walprop_get_query_result(conn) -#define walprop_set_nonblocking(conn, arg) \ - WalProposerFunctions->walprop_set_nonblocking(conn, arg) -#define walprop_socket(conn) \ - WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn) \ - WalProposerFunctions->walprop_flush(conn) -#define walprop_finish(conn) \ - WalProposerFunctions->walprop_finish(conn) -#define walprop_async_read(conn, buf, amount) \ - WalProposerFunctions->walprop_async_read(conn, buf, amount) -#define walprop_async_write(conn, buf, size) \ - WalProposerFunctions->walprop_async_write(conn, buf, size) -#define walprop_blocking_write(conn, buf, size) \ - WalProposerFunctions->walprop_blocking_write(conn, buf, size) - -/* - * The runtime location of the libpqwalproposer functions. - * - * This pointer is set by the initializer in libpqwalproposer, so that we - * can use it later. - */ -extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; - -#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/contrib/neon/walproposer_utils.c b/contrib/neon/walproposer_utils.c deleted file mode 100644 index cd8fd556c22..00000000000 --- a/contrib/neon/walproposer_utils.c +++ /dev/null @@ -1,1110 +0,0 @@ -#include "postgres.h" - -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" -#include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "walproposer_utils.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "../../src/interfaces/libpq/libpq-fe.h" -#include -#include - -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -/* START cloned file-local variables and functions from walsender.c */ - -/* - * xlogreader used for replication. Note that a WAL sender doing physical - * replication does not need xlogreader to read WAL, but it needs one to - * keep a state of its work. 
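WalProposerFunctionsType above is a plain function-pointer table: a loadable module installs its implementations once, and the walprop_* macros route every call through the table. The sketch below shows the same pattern in miniature; the ExampleFunctionsType names are hypothetical and exist only for illustration.

/* Standalone sketch of the function-pointer-table dispatch pattern. */
#include <stdio.h>

typedef struct
{
	int			(*conn_status) (void *conn);
	void		(*conn_finish) (void *conn);
} ExampleFunctionsType;			/* hypothetical, mirrors WalProposerFunctionsType */

static ExampleFunctionsType *ExampleFunctions = NULL;

#define example_status(conn)  ExampleFunctions->conn_status(conn)
#define example_finish(conn)  ExampleFunctions->conn_finish(conn)

/* the implementations a loadable module would install */
static int	impl_status(void *conn) { (void) conn; return 0; }
static void	impl_finish(void *conn) { (void) conn; }

static ExampleFunctionsType impl = {impl_status, impl_finish};

int
main(void)
{
	ExampleFunctions = &impl;	/* what the module's init hook would do */
	printf("status = %d\n", example_status(NULL));
	example_finish(NULL);
	return 0;
}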
- */ -static XLogReaderState *xlogreader = NULL; - -/* - * These variables keep track of the state of the timeline we're currently - * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, - * the timeline is not the latest timeline on this server, and the server's - * history forked off from that timeline at sendTimeLineValidUpto. - */ -static TimeLineID sendTimeLine = 0; -static TimeLineID sendTimeLineNextTLI = 0; -static bool sendTimeLineIsHistoric = false; -static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; - -/* - * Timestamp of last ProcessRepliesIfAny() that saw a reply from the - * standby. Set to 0 if wal_sender_timeout doesn't need to be active. - */ -static TimestampTz last_reply_timestamp = 0; - -/* Have we sent a heartbeat message asking for reply, since last reply? */ -static bool waiting_for_ping_response = false; - -static bool streamingDoneSending; -static bool streamingDoneReceiving; - -/* Are we there yet? */ -static bool WalSndCaughtUp = false; - -/* Flags set by signal handlers for later service in main loop */ -static volatile sig_atomic_t got_STOPPING = false; - -/* - * How far have we sent WAL already? This is also advertised in - * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) - */ -static XLogRecPtr sentPtr = InvalidXLogRecPtr; - -/* - * This is set while we are streaming. When not set - * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, - * the main loop is responsible for checking got_STOPPING and terminating when - * it's set (after streaming any remaining WAL). - */ -static volatile sig_atomic_t replication_active = false; - -typedef void (*WalSndSendDataCallback) (void); -static void WalSndLoop(WalSndSendDataCallback send_data); -static void XLogSendPhysical(void); -static XLogRecPtr GetStandbyFlushRecPtr(void); - -static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p); - -/* END cloned file-level variables and functions from walsender.c */ - -int -CompareLsn(const void *a, const void *b) -{ - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - - if (lsn1 < lsn2) - return -1; - else if (lsn1 == lsn2) - return 0; - else - return 1; -} - -/* Returns a human-readable string corresonding to the SafekeeperState - * - * The string should not be freed. 
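CompareLsn above is the comparator that GetAcknowledgedByQuorumWALPosition (earlier in this patch) feeds to qsort before picking element n_safekeepers - quorum, the highest LSN acknowledged by at least a quorum of nodes. A standalone sketch of that sort-and-pick step; the n / 2 + 1 quorum is an assumption of the sketch.

/* Standalone sketch of the qsort + "pick n - quorum" rule used with CompareLsn. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t XLogRecPtr;	/* stand-in for the real type */

static int
compare_lsn(const void *a, const void *b)
{
	XLogRecPtr	lsn1 = *(const XLogRecPtr *) a;
	XLogRecPtr	lsn2 = *(const XLogRecPtr *) b;

	return (lsn1 < lsn2) ? -1 : (lsn1 == lsn2) ? 0 : 1;
}

int
main(void)
{
	XLogRecPtr	responses[] = {0x500, 0x100, 0x300};	/* flush LSNs from 3 nodes */
	int			n = 3;
	int			quorum = n / 2 + 1;	/* assumed majority rule */

	qsort(responses, n, sizeof(XLogRecPtr), compare_lsn);
	/* after sorting, the element at n - quorum is acknowledged by >= quorum nodes */
	printf("commit LSN acknowledged by quorum: %lx\n",
		   (unsigned long) responses[n - quorum]);	/* 0x300 */
	return 0;
}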
- * - * The strings are intended to be used as a prefix to "state", e.g.: - * - * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); - * - * If this sort of phrasing doesn't fit the message, instead use something like: - * - * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); - */ -char* -FormatSafekeeperState(SafekeeperState state) -{ - char* return_val = NULL; - - switch (state) - { - case SS_OFFLINE: - return_val = "offline"; - break; - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - return_val = "connecting"; - break; - case SS_WAIT_EXEC_RESULT: - return_val = "receiving query result"; - break; - case SS_HANDSHAKE_RECV: - return_val = "handshake (receiving)"; - break; - case SS_VOTING: - return_val = "voting"; - break; - case SS_WAIT_VERDICT: - return_val = "wait-for-verdict"; - break; - case SS_SEND_ELECTED_FLUSH: - return_val = "send-announcement-flush"; - break; - case SS_IDLE: - return_val = "idle"; - break; - case SS_ACTIVE: - return_val = "active"; - break; - } - - Assert(return_val != NULL); - - return return_val; -} - -/* Asserts that the provided events are expected for given safekeeper's state */ -void -AssertEventsOkForState(uint32 events, Safekeeper* sk) -{ - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) - */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ - - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); - - if (!events_ok_for_state) - { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. */ - elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); - Assert(events_ok_for_state); - } -} - -/* Returns the set of events a safekeeper in this state should be waiting on - * - * This will return WL_NO_EVENTS (= 0) for some events. */ -uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) -{ - uint32 result = WL_NO_EVENTS; - - /* If the state doesn't have a modifier, we can check the base state */ - switch (state) - { - /* Connecting states say what they want in the name */ - case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; - case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; - - /* Reading states need the socket to be read-ready to continue */ - case SS_WAIT_EXEC_RESULT: - case SS_HANDSHAKE_RECV: - case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; - - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ - case SS_VOTING: - case SS_IDLE: - result = WL_SOCKET_READABLE; - break; - - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ - case SS_SEND_ELECTED_FLUSH: - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - - /* The offline state expects no events. 
*/ - case SS_OFFLINE: - result = WL_NO_EVENTS; - break; - - default: - Assert(false); - break; - } - - return result; -} - -/* Returns a human-readable string corresponding to the event set - * - * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the - * returned string may be meaingless. - * - * The string should not be freed. It should also not be expected to remain the same between - * function calls. */ -char* -FormatEvents(uint32 events) -{ - static char return_str[8]; - - /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; - - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ - - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; - return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; - return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; - return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; - return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; - - if (events & (~all_flags)) - { - elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); - return_str[6] = '*'; - return_str[7] = '\0'; - } - else - return_str[6] = '\0'; - - return (char *) &return_str; -} - -/* - * Convert a character which represents a hexadecimal digit to an integer. - * - * Returns -1 if the character is not a hexadecimal digit. - */ -static int -HexDecodeChar(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - - return -1; -} - -/* - * Decode a hex string into a byte string, 2 hex chars per byte. - * - * Returns false if invalid characters are encountered; otherwise true. 
- */ -bool -HexDecodeString(uint8 *result, char *input, int nbytes) -{ - int i; - - for (i = 0; i < nbytes; ++i) - { - int n1 = HexDecodeChar(input[i * 2]); - int n2 = HexDecodeChar(input[i * 2 + 1]); - - if (n1 < 0 || n2 < 0) - return false; - result[i] = n1 * 16 + n2; - } - - return true; -} - -/* -------------------------------- - * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint32 -pq_getmsgint32_le(StringInfo msg) -{ - uint32 n32; - - pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); - - return n32; -} - -/* -------------------------------- - * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint64 -pq_getmsgint64_le(StringInfo msg) -{ - uint64 n64; - - pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); - - return n64; -} - -/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ -void -pq_sendint32_le(StringInfo buf, uint32 i) -{ - enlargeStringInfo(buf, sizeof(uint32)); - memcpy(buf->data + buf->len, &i, sizeof(uint32)); - buf->len += sizeof(uint32); -} - -/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ -void -pq_sendint64_le(StringInfo buf, uint64 i) -{ - enlargeStringInfo(buf, sizeof(uint64)); - memcpy(buf->data + buf->len, &i, sizeof(uint64)); - buf->len += sizeof(uint64); -} - -/* - * Write XLOG data to disk. - */ -void -XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { - bool use_existent = true; - - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. - */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. 
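/*
 * [Editorial aside, not part of the original file]  A minimal usage sketch
 * for HexDecodeString() and the little-endian StringInfo helpers above.
 * The concrete values and the 16-byte identifier are illustrative
 * assumptions only, not the actual safekeeper wire protocol.
 */
static void
example_le_helpers(void)
{
	StringInfoData buf;
	char		hex[] = "0123456789abcdef0123456789abcdef";
	uint8		id[16];

	/* 32 hex characters decode into 16 raw bytes */
	if (!HexDecodeString(id, hex, 16))
		elog(ERROR, "invalid hex string");

	/* values are appended and fetched in native (little-endian) byte order */
	initStringInfo(&buf);
	pq_sendint64_le(&buf, UINT64CONST(42));
	Assert(pq_getmsgint64_le(&buf) == 42);
	pfree(buf.data);
}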
- */ -void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} - -/* START of cloned functions from walsender.c */ - -/* - * Handle START_REPLICATION command. - * - * At the moment, this never returns, but an ereport(ERROR) will take us back - * to the main loop. - */ -void -StartProposerReplication(StartReplicationCmd *cmd) -{ - XLogRecPtr FlushPtr; - - if (ThisTimeLineID == 0) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); - - /* create xlogreader for physical replication */ - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), - NULL); - - if (!xlogreader) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* - * We assume here that we're logging enough information in the WAL for - * log-shipping, since this is checked in PostmasterMain(). - * - * NOTE: wal_level can only change at shutdown, so in most cases it is - * difficult for there to be WAL data that we can still see that was - * written at wal_level='minimal'. - */ - - if (cmd->slotname) - { - ReplicationSlotAcquire(cmd->slotname, true); - if (SlotIsLogical(MyReplicationSlot)) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot use a logical replication slot for physical replication"))); - - /* - * We don't need to verify the slot's restart_lsn here; instead we - * rely on the caller requesting the starting point to use. If the - * WAL segment doesn't exist, we'll fail later. - */ - } - - /* - * Select the timeline. If it was given explicitly by the client, use - * that. Otherwise use the timeline of the last replayed record, which is - * kept in ThisTimeLineID. - * - * Neon doesn't currently use PG Timelines, but it may in the future, so - * we keep this code around to lighten the load for when we need it. - */ - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(); - } - else - FlushPtr = GetFlushRecPtr(); - - if (cmd->timeline != 0) - { - XLogRecPtr switchpoint; - - sendTimeLine = cmd->timeline; - if (sendTimeLine == ThisTimeLineID) - { - sendTimeLineIsHistoric = false; - sendTimeLineValidUpto = InvalidXLogRecPtr; - } - else - { - List *timeLineHistory; - - sendTimeLineIsHistoric = true; - - /* - * Check that the timeline the client requested exists, and the - * requested start location is on that timeline. - */ - timeLineHistory = readTimeLineHistory(ThisTimeLineID); - switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, - &sendTimeLineNextTLI); - list_free_deep(timeLineHistory); - - /* - * Found the requested timeline in the history. Check that - * requested startpoint is on that timeline in our history. - * - * This is quite loose on purpose. We only check that we didn't - * fork off the requested timeline before the switchpoint. We - * don't check that we switched *to* it before the requested - * starting point. 
This is because the client can legitimately - * request to start replication from the beginning of the WAL - * segment that contains switchpoint, but on the new timeline, so - * that it doesn't end up with a partial segment. If you ask for - * too old a starting point, you'll get an error later when we - * fail to find the requested WAL segment in pg_wal. - * - * XXX: we could be more strict here and only allow a startpoint - * that's older than the switchpoint, if it's still in the same - * WAL segment. - */ - if (!XLogRecPtrIsInvalid(switchpoint) && - switchpoint < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); - } - sendTimeLineValidUpto = switchpoint; - } - } - else - { - sendTimeLine = ThisTimeLineID; - sendTimeLineValidUpto = InvalidXLogRecPtr; - sendTimeLineIsHistoric = false; - } - - streamingDoneSending = streamingDoneReceiving = false; - - /* If there is nothing to stream, don't even enter COPY mode */ - if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) - { - /* - * When we first start replication the standby will be behind the - * primary. For some applications, for example synchronous - * replication, it is important to have a clear state for this initial - * catchup mode, so we can trigger actions when we change streaming - * state later. We may stay in this state for a long time, which is - * exactly why we want to be able to monitor whether or not we are - * still here. - */ - WalSndSetState(WALSNDSTATE_CATCHUP); - - /* - * Don't allow a request to stream from a future point in WAL that - * hasn't been flushed to disk in this server yet. - */ - if (FlushPtr < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); - } - - /* Start streaming from the requested point */ - sentPtr = cmd->startpoint; - - /* Initialize shared memory status, too */ - SpinLockAcquire(&MyWalSnd->mutex); - MyWalSnd->sentPtr = sentPtr; - SpinLockRelease(&MyWalSnd->mutex); - - SyncRepInitConfig(); - - /* Main loop of walsender */ - replication_active = true; - - WalSndLoop(XLogSendPhysical); - - replication_active = false; - if (got_STOPPING) - proc_exit(0); - WalSndSetState(WALSNDSTATE_STARTUP); - - Assert(streamingDoneSending && streamingDoneReceiving); - } - - if (cmd->slotname) - ReplicationSlotRelease(); - - /* - * Copy is finished now. Send a single-row result set indicating the next - * timeline. - */ - if (sendTimeLineIsHistoric) - { - char startpos_str[8 + 1 + 8 + 1]; - DestReceiver *dest; - TupOutputState *tstate; - TupleDesc tupdesc; - Datum values[2]; - bool nulls[2]; - - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", - LSN_FORMAT_ARGS(sendTimeLineValidUpto)); - - dest = CreateDestReceiver(DestRemoteSimple); - MemSet(nulls, false, sizeof(nulls)); - - /* - * Need a tuple descriptor representing two columns. int8 may seem - * like a surprising data type for this, but in theory int4 would not - * be wide enough for this, as TimeLineID is unsigned. 
- */ - tupdesc = CreateTemplateTupleDesc(2); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", - INT8OID, -1, 0); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", - TEXTOID, -1, 0); - - /* prepare for projection of tuple */ - tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); - - values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); - values[1] = CStringGetTextDatum(startpos_str); - - /* send it to dest */ - do_tup_output(tstate, values, nulls); - - end_tup_output(tstate); - } - - /* Send CommandComplete message */ - EndReplicationCommand("START_STREAMING"); -} - -/* - * Returns the latest point in WAL that has been safely flushed to disk, and - * can be sent to the standby. This should only be called when in recovery, - * ie. we're streaming to a cascaded standby. - * - * As a side-effect, ThisTimeLineID is updated to the TLI of the last - * replayed WAL record. - */ -static XLogRecPtr -GetStandbyFlushRecPtr(void) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - ThisTimeLineID = replayTLI; - - result = replayPtr; - if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) - result = receivePtr; - - return result; -} - -/* XLogReaderRoutine->segment_open callback */ -static void -WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - char path[MAXPGPATH]; - - /*------- - * When reading from a historic timeline, and there is a timeline switch - * within this segment, read from the WAL segment belonging to the new - * timeline. - * - * For example, imagine that this server is currently on timeline 5, and - * we're streaming timeline 4. The switch from timeline 4 to 5 happened at - * 0/13002088. In pg_wal, we have these files: - * - * ... - * 000000040000000000000012 - * 000000040000000000000013 - * 000000050000000000000013 - * 000000050000000000000014 - * ... - * - * In this situation, when requested to send the WAL from segment 0x13, on - * timeline 4, we read the WAL from file 000000050000000000000013. Archive - * recovery prefers files from newer timelines, so if the segment was - * restored from the archive on this server, the file belonging to the old - * timeline, 000000040000000000000013, might not exist. Their contents are - * equal up to the switchpoint, because at a timeline switch, the used - * portion of the old segment is copied to the new file. ------- - */ - *tli_p = sendTimeLine; - if (sendTimeLineIsHistoric) - { - XLogSegNo endSegNo; - - XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); - if (nextSegNo == endSegNo) - *tli_p = sendTimeLineNextTLI; - } - - XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; - - /* - * If the file is not found, assume it's because the standby asked for a - * too old WAL segment that has already been removed or recycled. 
- */ - if (errno == ENOENT) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); - } - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); -} - - -/* Main loop of walsender process that streams the WAL over Copy messages. */ -static void -WalSndLoop(WalSndSendDataCallback send_data) -{ - /* - * Initialize the last reply timestamp. That enables timeout processing - * from hereon. - */ - last_reply_timestamp = GetCurrentTimestamp(); - waiting_for_ping_response = false; - - /* - * Loop until we reach the end of this timeline or the client requests to - * stop streaming. - */ - for (;;) - { - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } - - /* always true */ - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } - } -} - -/* - * Send out the WAL in its normal physical/stored form. - * - * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, - * but not yet sent to the client, and buffer it in the libpq output - * buffer. - * - * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, - * otherwise WalSndCaughtUp is set to false. - */ -static void -XLogSendPhysical(void) -{ - XLogRecPtr SendRqstPtr; - XLogRecPtr startptr; - XLogRecPtr endptr; - Size nbytes PG_USED_FOR_ASSERTS_ONLY; - - /* If requested switch the WAL sender to the stopping state. */ - if (got_STOPPING) - WalSndSetState(WALSNDSTATE_STOPPING); - - if (streamingDoneSending) - { - WalSndCaughtUp = true; - return; - } - - /* Figure out how far we can safely send the WAL. */ - if (sendTimeLineIsHistoric) - { - /* - * Streaming an old timeline that's in this server's history, but is - * not the one we're currently inserting or replaying. It can be - * streamed up to the point where we switched off that timeline. - */ - SendRqstPtr = sendTimeLineValidUpto; - } - else if (am_cascading_walsender) - { - /* - * Streaming the latest timeline on a standby. - * - * Attempt to send all WAL that has already been replayed, so that we - * know it's valid. If we're receiving WAL through streaming - * replication, it's also OK to send any WAL that has been received - * but not replayed. - * - * The timeline we're recovering from can change, or we can be - * promoted. In either case, the current timeline becomes historic. We - * need to detect that so that we don't try to stream past the point - * where we switched to another timeline. We check for promotion or - * timeline switch after calculating FlushPtr, to avoid a race - * condition: if the timeline becomes historic just after we checked - * that it was still current, it's still be OK to stream it up to the - * FlushPtr that was calculated before it became historic. - */ - bool becameHistoric = false; - - SendRqstPtr = GetStandbyFlushRecPtr(); - - if (!RecoveryInProgress()) - { - /* - * We have been promoted. RecoveryInProgress() updated - * ThisTimeLineID to the new current timeline. 
- */ - am_cascading_walsender = false; - becameHistoric = true; - } - else - { - /* - * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? ThisTimeLineID was - * updated by the GetStandbyFlushRecPtr() call above. - */ - if (sendTimeLine != ThisTimeLineID) - becameHistoric = true; - } - - if (becameHistoric) - { - /* - * The timeline we were sending has become historic. Read the - * timeline history file of the new timeline to see where exactly - * we forked off from the timeline we were sending. - */ - List *history; - - history = readTimeLineHistory(ThisTimeLineID); - sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); - - Assert(sendTimeLine < sendTimeLineNextTLI); - list_free_deep(history); - - sendTimeLineIsHistoric = true; - - SendRqstPtr = sendTimeLineValidUpto; - } - } - else - { - /* - * Streaming the current timeline on a primary. - * - * Attempt to send all data that's already been written out and - * fsync'd to disk. We cannot go further than what's been written out - * given the current implementation of WALRead(). And in any case - * it's unsafe to send WAL that is not securely down to disk on the - * primary: if the primary subsequently crashes and restarts, standbys - * must not have applied any WAL that got lost on the primary. - */ - SendRqstPtr = GetFlushRecPtr(); - } - - /* - * Record the current system time as an approximation of the time at which - * this WAL location was written for the purposes of lag tracking. - * - * In theory we could make XLogFlush() record a time in shmem whenever WAL - * is flushed and we could get that time as well as the LSN when we call - * GetFlushRecPtr() above (and likewise for the cascading standby - * equivalent), but rather than putting any new code into the hot WAL path - * it seems good enough to capture the time here. We should reach this - * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that - * may take some time, we read the WAL flush pointer and take the time - * very close to together here so that we'll get a later position if it is - * still moving. - * - * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, - * this gives us a cheap approximation for the WAL flush time for this - * LSN. - * - * Note that the LSN is not necessarily the LSN for the data contained in - * the present message; it's the end of the WAL, which might be further - * ahead. All the lag tracking machinery cares about is finding out when - * that arbitrary LSN is eventually reported as written, flushed and - * applied, so that it can measure the elapsed time. - */ - LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); - - /* - * If this is a historic timeline and we've reached the point where we - * forked to the next timeline, stop streaming. - * - * Note: We might already have sent WAL > sendTimeLineValidUpto. The - * startup process will normally replay all WAL that has been received - * from the primary, before promoting, but if the WAL streaming is - * terminated at a WAL page boundary, the valid portion of the timeline - * might end in the middle of a WAL record. We might've already sent the - * first half of that partial WAL record to the cascading standby, so that - * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't - * replay the partial WAL record either, so it can still follow our - * timeline switch. 
- */ - if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) - { - /* close the current file. */ - if (xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); - - /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); - streamingDoneSending = true; - - WalSndCaughtUp = true; - - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", - LSN_FORMAT_ARGS(sendTimeLineValidUpto), - LSN_FORMAT_ARGS(sentPtr)); - return; - } - - /* Do we have any work to do? */ - Assert(sentPtr <= SendRqstPtr); - if (SendRqstPtr <= sentPtr) - { - WalSndCaughtUp = true; - return; - } - - /* - * Figure out how much to send in one message. If there's no more than - * MAX_SEND_SIZE bytes to send, send everything. Otherwise send - * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. - * - * The rounding is not only for performance reasons. Walreceiver relies on - * the fact that we never split a WAL record across two messages. Since a - * long WAL record is split at page boundary into continuation records, - * page boundary is always a safe cut-off point. We also assume that - * SendRqstPtr never points to the middle of a WAL record. - */ - startptr = sentPtr; - endptr = startptr; - endptr += MAX_SEND_SIZE; - - /* if we went beyond SendRqstPtr, back off */ - if (SendRqstPtr <= endptr) - { - endptr = SendRqstPtr; - if (sendTimeLineIsHistoric) - WalSndCaughtUp = false; - else - WalSndCaughtUp = true; - } - else - { - /* round down to page boundary. */ - endptr -= (endptr % XLOG_BLCKSZ); - WalSndCaughtUp = false; - } - - nbytes = endptr - startptr; - Assert(nbytes <= MAX_SEND_SIZE); - - /* always true */ - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } - else - { - /* code removed for brevity */ - } - sentPtr = endptr; - - /* Update shared memory status */ - { - WalSnd *walsnd = MyWalSnd; - - SpinLockAcquire(&walsnd->mutex); - walsnd->sentPtr = sentPtr; - SpinLockRelease(&walsnd->mutex); - } - - /* Report progress of XLOG streaming in PS display */ - if (update_process_title) - { - char activitymsg[50]; - - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", - LSN_FORMAT_ARGS(sentPtr)); - set_ps_display(activitymsg); - } -} - diff --git a/contrib/neon/walproposer_utils.h b/contrib/neon/walproposer_utils.h deleted file mode 100644 index 4771d3ff829..00000000000 --- a/contrib/neon/walproposer_utils.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NEON_WALPROPOSER_UTILS_H__ -#define __NEON_WALPROPOSER_UTILS_H__ - -#include "walproposer.h" - -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); - -#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/contrib/neon_test_utils/Makefile b/contrib/neon_test_utils/Makefile deleted file mode 100644 index bd618e6d96e..00000000000 --- a/contrib/neon_test_utils/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# contrib/neon_test_utils/Makefile - - -MODULE_big = neon_test_utils -OBJS = \ - $(WIN32RES) \ - neontest.o - -EXTENSION = neon_test_utils -DATA = 
neon_test_utils--1.0.sql -PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" - -EXTRA_INSTALL=contrib/neon - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -PG_CPPFLAGS = -I$(top_srcdir)/contrib -subdir = contrib/neon_test_utils -top_builddir = ../.. -include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/neon_test_utils/neon_test_utils--1.0.sql b/contrib/neon_test_utils/neon_test_utils--1.0.sql deleted file mode 100644 index 402981a9a66..00000000000 --- a/contrib/neon_test_utils/neon_test_utils--1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit - -CREATE FUNCTION test_consume_xids(nxids int) -RETURNS VOID -AS 'MODULE_PATHNAME', 'test_consume_xids' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION clear_buffer_cache() -RETURNS VOID -AS 'MODULE_PATHNAME', 'clear_buffer_cache' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) -RETURNS VOID -AS 'MODULE_PATHNAME', 'neon_xlogflush' -LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/neon_test_utils/neon_test_utils.control b/contrib/neon_test_utils/neon_test_utils.control deleted file mode 100644 index 94e67205039..00000000000 --- a/contrib/neon_test_utils/neon_test_utils.control +++ /dev/null @@ -1,5 +0,0 @@ -# neon_test_utils extension -comment = 'helpers for neon testing and debugging' -default_version = '1.0' -module_pathname = '$libdir/neon_test_utils' -relocatable = true diff --git a/contrib/neon_test_utils/neontest.c b/contrib/neon_test_utils/neontest.c deleted file mode 100644 index a3e730efe27..00000000000 --- a/contrib/neon_test_utils/neontest.c +++ /dev/null @@ -1,304 +0,0 @@ -/*------------------------------------------------------------------------- - * - * neontest.c - * Helpers for neon testing and debugging - * - * IDENTIFICATION - * contrib/neon_test_utils/neontest.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/relation.h" -#include "access/xact.h" -#include "access/xlog.h" -#include "catalog/namespace.h" -#include "fmgr.h" -#include "funcapi.h" -#include "miscadmin.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "utils/builtins.h" -#include "utils/pg_lsn.h" -#include "utils/rel.h" -#include "utils/varlena.h" -#include "neon/pagestore_client.h" - -PG_MODULE_MAGIC; - -extern void _PG_init(void); - -PG_FUNCTION_INFO_V1(test_consume_xids); -PG_FUNCTION_INFO_V1(clear_buffer_cache); -PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); -PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); -PG_FUNCTION_INFO_V1(neon_xlogflush); - -/* - * Linkage to functions in zenith module. 
- * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c - */ -typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); - -static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; - -/* - * Module initialize function: fetch function pointers for cross-module calls. - */ -void -_PG_init(void) -{ - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); - zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/neon", "zenith_read_at_lsn", - true, NULL); -} - -#define zenith_read_at_lsn zenith_read_at_lsn_ptr - -/* - * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. - */ -Datum -test_consume_xids(PG_FUNCTION_ARGS) -{ - int32 nxids = PG_GETARG_INT32(0); - TransactionId topxid; - FullTransactionId fullxid; - TransactionId xid; - TransactionId targetxid; - - /* make sure we have a top-XID first */ - topxid = GetTopTransactionId(); - - xid = ReadNextTransactionId(); - - targetxid = xid + nxids; - while (targetxid < FirstNormalTransactionId) - targetxid++; - - while (TransactionIdPrecedes(xid, targetxid)) - { - fullxid = GetNewTransactionId(true); - xid = XidFromFullTransactionId(fullxid); - elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); - } - - PG_RETURN_VOID(); -} - -/* - * Flush the buffer cache, evicting all pages that are not currently pinned. - */ -Datum -clear_buffer_cache(PG_FUNCTION_ARGS) -{ - bool save_zenith_test_evict; - - /* - * Temporarily set the zenith_test_evict GUC, so that when we pin and - * unpin a buffer, the buffer is evicted. We use that hack to evict all - * buffers, as there is no explicit "evict this buffer" function in the - * buffer manager. - */ - save_zenith_test_evict = zenith_test_evict; - zenith_test_evict = true; - PG_TRY(); - { - /* Scan through all the buffers */ - for (int i = 0; i < NBuffers; i++) - { - BufferDesc *bufHdr; - uint32 buf_state; - Buffer bufferid; - bool isvalid; - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blocknum; - - /* Peek into the buffer header to see what page it holds. */ - bufHdr = GetBufferDescriptor(i); - buf_state = LockBufHdr(bufHdr); - - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - isvalid = true; - else - isvalid = false; - bufferid = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; - forknum = bufHdr->tag.forkNum; - blocknum = bufHdr->tag.blockNum; - - UnlockBufHdr(bufHdr, buf_state); - - /* - * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from - * the buffer cache if no one else is holding a pin on it. - */ - if (isvalid) - { - if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) - ReleaseBuffer(bufferid); - } - } - } - PG_FINALLY(); - { - /* restore the GUC */ - zenith_test_evict = save_zenith_test_evict; - } - PG_END_TRY(); - - PG_RETURN_VOID(); -} - - -/* - * Reads the page from page server without buffer cache - * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN - * NULL read lsn will result in reading the latest version. 
- * - * Note: reading latest version will result in waiting for latest changes to reach the page server, - * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page - */ -Datum -get_raw_page_at_lsn(PG_FUNCTION_ARGS) -{ - bytea *raw_page; - ForkNumber forknum; - RangeVar *relrv; - Relation rel; - char *raw_page_data; - text *relname; - text *forkname; - uint32 blkno; - - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); - - if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) - PG_RETURN_NULL(); - - relname = PG_GETARG_TEXT_PP(0); - forkname = PG_GETARG_TEXT_PP(1); - blkno = PG_GETARG_UINT32(2); - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); - - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); - - /* Check that this relation has storage */ - if (rel->rd_rel->relkind == RELKIND_VIEW) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from view \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from composite type \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from foreign table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned index \"%s\"", - RelationGetRelationName(rel)))); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. 
- */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - - forknum = forkname_to_number(text_to_cstring(forkname)); - - /* Initialize buffer to copy to */ - raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); - - relation_close(rel, AccessShareLock); - - PG_RETURN_BYTEA_P(raw_page); -} - -/* - * Another option to read a relation page from page server without cache - * this version doesn't validate input and allows reading blocks of dropped relations - * - * Note: reading latest version will result in waiting for latest changes to reach the page server, - * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page - */ -Datum -get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) -{ - char *raw_page_data; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); - - if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || - PG_ARGISNULL(3) || PG_ARGISNULL(4)) - PG_RETURN_NULL(); - - { - RelFileNode rnode = { - .spcNode = PG_GETARG_OID(0), - .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2) - }; - - ForkNumber forknum = PG_GETARG_UINT32(3); - - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); - - - /* Initialize buffer to copy to */ - bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); - PG_RETURN_BYTEA_P(raw_page); - } -} - -/* - * Directly calls XLogFlush(lsn) to flush WAL buffers. - */ -Datum -neon_xlogflush(PG_FUNCTION_ARGS) -{ - XLogRecPtr lsn = PG_GETARG_LSN(0); - XLogFlush(lsn); - PG_RETURN_VOID(); -} From aa0499a8d8415aab1cb1727b1f97c2cfdb11fa79 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 26 Aug 2022 13:54:35 +0200 Subject: [PATCH 164/214] Remove Dockerfile, it's now in the neon repo (#199) --- Dockerfile | 75 ------------------------------------------------------ 1 file changed, 75 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 77648e21bfd..00000000000 --- a/Dockerfile +++ /dev/null @@ -1,75 +0,0 @@ -# Allow specifiyng different compute-tools tag and image repo, so we are -# able to use different images -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com -ARG IMAGE=compute-tools -ARG TAG=latest - -# -# Image with pre-built tools -# -FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps -# Only to get ready compute_ctl binary as deppendency - -# -# Image with Postgres build deps -# -FROM debian:buster-slim AS build-deps - -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev - -# -# Image with built Postgres -# -FROM build-deps AS pg-build - -# Add user postgres -RUN adduser postgres -RUN mkdir /pg && chown postgres:postgres /pg - -# Copy source files -COPY . 
/pg/ - -# Build and install Postgres locally -RUN mkdir /pg/compute_build && cd /pg/compute_build && \ - ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ - # Install main binaries and contribs - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/neon install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install - -USER postgres -WORKDIR /pg - -# -# Final compute node image to be exported -# -FROM debian:buster-slim - -# libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev - -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute - -# Copy ready Postgres binaries -COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local - -# Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl - -# XXX: temporary symlink for compatibility with old control-plane -RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl - -# Add postgres shared objects to the search path -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -USER postgres - -ENTRYPOINT ["/usr/local/bin/compute_ctl"] From a4b65465e4b74b9e8f5ab031dc980d07f0fa5ec8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 Sep 2022 22:14:35 +0300 Subject: [PATCH 165/214] Move backpressure throttling implementation to neon extension (#203) * Move backpressure throttling implementation to neon extension and function for monitoring throttling time * Update src/include/miscadmin.h Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- src/backend/tcop/postgres.c | 45 ++++++++----------------------------- src/include/miscadmin.h | 4 ++++ 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 3b15ffab8e7..aebd978bf65 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -181,6 +181,8 @@ static ProcSignalReason RecoveryConflictReason; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3138,14 +3140,15 @@ RecoveryConflictInterrupt(ProcSignalReason reason) * return; another interrupt could have arrived. But we promise that * any pre-existing one will have been serviced.) */ -static void -ProcessInterrupts_pg(void) +void +ProcessInterrupts(void) { /* OK to accept any interrupts now? 
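/*
 * [Editorial aside, not part of this patch]  The hunks in this patch replace
 * the hard-coded backpressure loop with the ProcessInterruptsCallback hook.
 * Below is a rough sketch of how an extension could install such a callback;
 * backpressure_lag() is an assumed placeholder for the extension's own
 * "how far behind are the replicas" check, and a real extension would also
 * chain any previously installed callback.
 */
static bool
backpressure_throttling_callback(void)
{
	/* don't throttle walsenders or read-only transactions */
	if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
		return false;

	if (backpressure_lag() == 0)	/* assumed helper */
		return false;

	set_ps_display("backpressure throttling");
	pg_usleep(10000L);				/* 0.01 sec between retries */
	return true;					/* ask ProcessInterrupts() to loop */
}

void
_PG_init(void)
{
	ProcessInterruptsCallback = backpressure_throttling_callback;
}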
*/ if (InterruptHoldoffCount != 0 || CritSectionCount != 0) return; InterruptPending = false; + Retry: if (ProcDiePending) { ProcDiePending = false; @@ -3379,42 +3382,12 @@ ProcessInterrupts_pg(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); -} - -void -ProcessInterrupts(void) -{ - uint64 lag; - - if (InterruptHoldoffCount != 0 || CritSectionCount != 0) - return; - // Don't throttle read only transactions and wal sender - if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + /* Call registered callback if any */ + if (ProcessInterruptsCallback) { - ProcessInterrupts_pg(); - return; - } - - #define BACK_PRESSURE_DELAY 10000L // 0.01 sec - while(true) - { - ProcessInterrupts_pg(); - - if (delay_backend_us != NULL) - { - // Suspend writers until replicas catch up - lag = delay_backend_us(); - if (lag <= 0) - break; - - set_ps_display("backpressure throttling"); - - elog(DEBUG2, "backpressure throttling: lag %lu", lag); - pg_usleep(BACK_PRESSURE_DELAY); - } - else - break; + if (ProcessInterruptsCallback()) + goto Retry; } } diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 72bd0a7ebd4..bfbfe5da8c9 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -106,6 +106,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback called by ProcessInterrupts in the loop while it is returning true. */ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ From 585290ceef6c13c20b4aa10e518a16899acf35bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 4 Sep 2022 21:37:51 +0300 Subject: [PATCH 166/214] Merge last written cache lsn with new main branch (#201) --- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/gist/gistbuild.c | 10 +- src/backend/access/spgist/spginsert.c | 4 +- src/backend/access/transam/xlog.c | 199 +++++++++++++++++++++-- src/backend/commands/dbcommands.c | 5 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 10 ++ src/include/access/xlog.h | 13 +- 8 files changed, 217 insertions(+), 28 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index dfad28d1f61..77af193cfbb 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,8 +421,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 55a194a691f..93c3c43586c 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -335,9 +335,11 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, + index->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, 
index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); - smgr_end_unlogged_build(index->rd_smgr); } @@ -467,7 +469,9 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index a7608f4d54c..9773592b703 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,8 +143,10 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 311a6f54fbc..68b252da0bc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -61,6 +61,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -113,6 +114,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -182,6 +184,26 @@ const struct config_enum_entry recovery_target_action_options[] = { {NULL, 0, false} }; + +typedef struct LastWrittenLsnCacheEntry +{ + BufferTag key; + XLogRecPtr lsn; + /* double linked list for LRU replacement algorithm */ + dlist_node lru_node; +} LastWrittenLsnCacheEntry; + + +/* + * Cache of last written LSN for each relation chunk (hash bucket). + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. + * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. + * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -749,7 +771,17 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ XLogRecPtr lastFpwDisableRecPtr; - XLogRecPtr lastWrittenPageLSN; + + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. 
+	 */
+	dlist_head lastWrittenLsnLRU;
 
 	/* neon: copy of startup's RedoStartLSN for walproposer's use */
 	XLogRecPtr	RedoStartLSN;
@@ -772,6 +804,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
  */
 static ControlFileData *ControlFile = NULL;
 
+#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */
+
 /*
  * Calculate the amount of space left on the page after 'endptr'. Beware
  * multiple evaluation!
@@ -5143,11 +5177,8 @@ LocalProcessControlFile(bool reset)
 	ReadControlFile();
 }
 
-/*
- * Initialization of shared memory for XLOG
- */
-Size
-XLOGShmemSize(void)
+static Size
+XLOGCtlShmemSize(void)
 {
 	Size		size;
 
@@ -5187,6 +5218,16 @@ XLOGShmemSize(void)
 	return size;
 }
 
+/*
+ * Initialization of shared memory for XLOG
+ */
+Size
+XLOGShmemSize(void)
+{
+	return XLOGCtlShmemSize() +
+		hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry));
+}
+
 void
 XLOGShmemInit(void)
 {
@@ -5216,6 +5257,15 @@ XLOGShmemInit(void)
 	XLogCtl = (XLogCtlData *)
 		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
 
+	{
+		static HASHCTL info;
+		info.keysize = sizeof(BufferTag);
+		info.entrysize = sizeof(LastWrittenLsnCacheEntry);
+		lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache",
+											lastWrittenLsnCacheSize, lastWrittenLsnCacheSize,
+											&info,
+											HASH_ELEM | HASH_BLOBS);
+	}
 	localControlFile = ControlFile;
 	ControlFile = (ControlFileData *)
 		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
@@ -8119,7 +8169,8 @@ StartupXLOG(void)
 	XLogCtl->LogwrtRqst.Write = EndOfLog;
 	XLogCtl->LogwrtRqst.Flush = EndOfLog;
 
-	XLogCtl->lastWrittenPageLSN = EndOfLog;
+	XLogCtl->maxLastWrittenLsn = EndOfLog;
+	dlist_init(&XLogCtl->lastWrittenLsnLRU);
 
 	LocalSetXLogInsertAllowed();
 
@@ -8895,29 +8946,141 @@ GetInsertRecPtr(void)
 }
 
 /*
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
+ * GetLastWrittenLSN -- Returns maximal LSN of written page.
+ * It returns an upper bound for the last written LSN of a given page,
+ * either from a cached last written LSN or a global maximum last written LSN.
+ * If rnode is InvalidOid then we calculate maximum among all cached LSNs and maxLastWrittenLsn.
+ * If the cache is large enough, iterating through all hash items may be rather expensive.
+ * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
  */
 XLogRecPtr
-GetLastWrittenPageLSN(void)
+GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
 {
 	XLogRecPtr	lsn;
-	SpinLockAcquire(&XLogCtl->info_lck);
-	lsn = XLogCtl->lastWrittenPageLSN;
-	SpinLockRelease(&XLogCtl->info_lck);
+	LastWrittenLsnCacheEntry* entry;
+
+	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);
+
+	/* Maximal last written LSN among all non-cached pages */
+	lsn = XLogCtl->maxLastWrittenLsn;
+
+	if (rnode.relNode != InvalidOid)
+	{
+		BufferTag key;
+		key.rnode = rnode;
+		key.forkNum = forknum;
+		key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET;
+		entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
+		if (entry != NULL)
+			lsn = entry->lsn;
+	}
+	else
+	{
+		HASH_SEQ_STATUS seq;
+		/* Find maximum of all cached LSNs */
+		hash_seq_init(&seq, lastWrittenLsnCache);
+		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
+		{
+			if (entry->lsn > lsn)
+				lsn = entry->lsn;
+		}
+	}
+	LWLockRelease(LastWrittenLsnLock);
 
 	return lsn;
 }
 
 /*
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
+ * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
+ * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. To reduce cache size we store max LSN not for each page, but for + * bucket (1024 blocks). This cache allows to use old LSN when + * requesting pages of unchanged or appended relations. + * + * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. */ void -SetLastWrittenPageLSN(XLogRecPtr lsn) +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till) { - SpinLockAcquire(&XLogCtl->info_lck); - if (lsn > XLogCtl->lastWrittenPageLSN) - XLogCtl->lastWrittenPageLSN = lsn; - SpinLockRelease(&XLogCtl->info_lck); + if (lsn == InvalidXLogRecPtr) + return; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + if (rnode.relNode == InvalidOid) + { + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + BlockNumber bucket; + + key.rnode = rnode; + key.forkNum = forknum; + for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket++) + { + key.blockNum = bucket; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node); + } + } + LWLockRelease(LastWrittenLsnLock); +} + +/* + * SetLastWrittenLSNForBlock -- Set maximal LSN for block + */ +void +SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +{ + SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, blkno); +} + +/* + * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata + */ +void +SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum) +{ + SetLastWrittenLSNForBlock(lsn, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database + */ +void +SetLastWrittenLSNForDatabase(XLogRecPtr lsn) +{ + RelFileNode dummyNode = {InvalidOid, InvalidOid, InvalidOid}; + SetLastWrittenLSNForBlock(lsn, dummyNode, MAIN_FORKNUM, 0); } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 97a58988fb0..943cd8a696d 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -685,7 +685,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForDatabase(lsn); } } table_endscan(scan); @@ -2363,8 +2363,7 @@ dbase_redo(XLogReaderState *record) */ { XLogRecPtr lsn = record->EndRecPtr; - - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForDatabase(lsn); } } 
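/*
 * [Editorial aside, not part of this patch]  A sketch of how a pluggable
 * smgr is expected to use the last-written LSN cache introduced here: record
 * an LSN whenever a page is WAL-logged, and use the cached value as the
 * request LSN when the page is later read back.  request_page_from_pageserver()
 * is an assumed placeholder, not a real function in this patch series.
 */
static void
example_after_page_wal_logged(SMgrRelation reln, ForkNumber forknum,
							  BlockNumber blkno, XLogRecPtr lsn)
{
	/* remember the LSN up to which this page could have been changed */
	SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blkno);
}

static void
example_read_page(SMgrRelation reln, ForkNumber forknum,
				  BlockNumber blkno, char *buffer)
{
	/* upper bound of the page's last modification; safe LSN to request at */
	XLogRecPtr	request_lsn = GetLastWrittenLSN(reln->smgr_rnode.node,
												forknum, blkno);

	/* assumed placeholder for the actual page-server request */
	request_page_from_pageserver(reln->smgr_rnode.node, forknum, blkno,
								 request_lsn, buffer);
}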
 	else if (info == XLOG_DBASE_DROP)
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c2956..b4652c33ff6 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+LastWrittenLsnLock					48
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 218ab8bf673..db52b7994d4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2357,6 +2357,16 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"lsn_cache_size", PGC_POSTMASTER, UNGROUPED,
+			gettext_noop("Size of last written LSN cache used by Neon."),
+			NULL
+		},
+		&lastWrittenLsnCacheSize,
+		1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */
+		NULL, NULL, NULL
+	},
+
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 66fe9dfcd9e..182f6c6449f 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -31,6 +31,11 @@ extern int	sync_method;
 
 extern PGDLLIMPORT TimeLineID ThisTimeLineID;	/* current TLI */
 
+/*
+ * Pseudo block number used to associate LSN with relation metadata (relation size)
+ */
+#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber
+
 /*
  * Prior to 8.4, all activity during recovery was carried out by the startup
  * process.  This local variable continues to be used in many parts of the
@@ -132,6 +137,7 @@ extern char *PrimaryConnInfo;
 extern char *PrimarySlotName;
 extern bool wal_receiver_create_temp_slot;
 extern bool track_wal_io_timing;
+extern int	lastWrittenLsnCacheSize;
 
 /* indirectly set via GUC system */
 extern TransactionId recoveryTargetXid;
@@ -351,8 +357,11 @@ extern XLogRecPtr GetFlushRecPtr(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
 extern void RemovePromoteSignalFiles(void);
 
-extern void SetLastWrittenPageLSN(XLogRecPtr lsn);
-extern XLogRecPtr GetLastWrittenPageLSN(void);
+extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno);
+extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber till);
+extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn);
+extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum);
+extern XLogRecPtr GetLastWrittenLSN(RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno);
 
 extern XLogRecPtr GetRedoStartLsn(void);
 

From 74df0adc9d09b9709a0a247e71fdb5d0cd5903ce Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Mon, 12 Sep 2022 17:25:07 +0300
Subject: [PATCH 167/214] Local prefetch implementation for Postgres 14

---
 src/backend/access/heap/heapam.c       | 17 +++++++++++++++++
 src/backend/access/heap/vacuumlazy.c   | 17 +++++++++++++++++
 src/backend/optimizer/path/costsize.c  |  2 ++
 src/backend/storage/smgr/md.c          |  9 +++++++++
 src/backend/storage/smgr/smgr.c        | 10 ++++++++++
 src/backend/utils/misc/guc.c           | 20 ++++++++++++++++++++
 src/include/optimizer/cost.h           |  2 ++
 src/include/storage/md.h               |  1 +
 src/include/storage/smgr.h             |  2 ++
 src/test/regress/expected/sysviews.out |  3 ++-
 10 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/heap/heapam.c
b/src/backend/access/heap/heapam.c index 116ed825c83..be16700323b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -53,6 +53,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" @@ -398,6 +399,22 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ CHECK_FOR_INTERRUPTS(); + /* Prefetch next block */ + if (enable_seqscan_prefetch) + { + int prefetch_limit = seqscan_prefetch_buffers; + ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) + prefetch_limit = pbscanwork->phsw_chunk_remaining; + if (page + prefetch_limit >= scan->rs_nblocks) + prefetch_limit = scan->rs_nblocks - page - 1; + + RelationOpenSmgr(scan->rs_base.rs_rd); + smgr_reset_prefetch(scan->rs_base.rs_rd->rd_smgr); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, page+i); + } + /* read page using selected strategy */ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page, RBM_NORMAL, scan->rs_strategy); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index c684c4fbee3..cc762d80dad 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -68,6 +68,7 @@ #include "commands/vacuum.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "optimizer/paths.h" #include "pgstat.h" #include "portability/instr_time.h" @@ -1217,6 +1218,14 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + if (enable_seqscan_prefetch) + { + int prefetch_limit = Min(nblocks - blkno - 1, seqscan_prefetch_buffers); + RelationOpenSmgr(vacrel->rel); + smgr_reset_prefetch(vacrel->rel->rd_smgr); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); + } buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); @@ -2350,6 +2359,14 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]); + if (enable_seqscan_prefetch) + { + int prefetch_limit = Min(vacrel->dead_tuples->num_tuples - tupindex - 1, seqscan_prefetch_buffers); + RelationOpenSmgr(vacrel->rel); + smgr_reset_prefetch(vacrel->rel->rd_smgr); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex + i])); + } vacrel->blkno = tblk; buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, vacrel->bstrategy); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 006f91f0a87..c0c2d4a9aa8 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -150,6 +150,8 @@ bool enable_parallel_append = true; bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; +bool enable_seqscan_prefetch = true; +int seqscan_prefetch_buffers = 0; typedef struct { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 0d834c69ac5..c7f2c647851 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -577,6 +577,15 @@ mdprefetch(SMgrRelation reln, ForkNumber 
forknum, BlockNumber blocknum) return true; } + +/* + * md_reset_prefetch() -- Cancel all previous prefetch requests + */ +void +md_reset_prefetch(SMgrRelation reln) +{ +} + /* * mdwriteback() -- Tell the kernel to write pages back to storage. * diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index f1e676bcc3e..c44a27bafbf 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -38,6 +38,7 @@ static const f_smgr smgr_md = { .smgr_unlink = mdunlink, .smgr_extend = mdextend, .smgr_prefetch = mdprefetch, + .smgr_reset_prefetch = md_reset_prefetch, .smgr_read = mdread, .smgr_write = mdwrite, .smgr_writeback = mdwriteback, @@ -500,6 +501,15 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } +/* + * smgr_reset_prefetch() -- Cancel all previous prefetch requests + */ +void +smgr_reset_prefetch(SMgrRelation reln) +{ + (*reln->smgr).smgr_reset_prefetch(reln); +} + /* * smgrread() -- read a particular block from a relation into the supplied * buffer. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index db52b7994d4..4c399b58b0a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -957,6 +957,16 @@ static const unit_conversion time_unit_conversion_table[] = static struct config_bool ConfigureNamesBool[] = { + { + {"enable_seqscan_prefetch", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables prefetching of subsequent pages during sequential scan."), + NULL, + GUC_EXPLAIN + }, + &enable_seqscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), @@ -2145,6 +2155,16 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { + { + {"seqscan_prefetch_buffers", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Number of subsequent buffers to be prefetched during sequential scan."), + NULL, + GUC_EXPLAIN + }, + &seqscan_prefetch_buffers, + 8, 0, 1000, + NULL, NULL, NULL + }, { {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING, gettext_noop("Forces a switch to the next WAL file if a " diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2113bc82de0..155d24cc2ac 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -67,6 +67,8 @@ extern PGDLLIMPORT bool enable_parallel_append; extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT int seqscan_prefetch_buffers; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 752b440864d..84868bdde96 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -30,6 +30,7 @@ extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void md_reset_prefetch(SMgrRelation reln); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 5fcb2060007..3305897bcd5 100644 ---
a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -119,6 +119,7 @@ typedef struct f_smgr BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_reset_prefetch) (SMgrRelation reln); void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); @@ -156,6 +157,7 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void smgr_reset_prefetch(SMgrRelation reln); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 2088857615a..4453fcaa517 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -120,9 +120,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(20 rows) +(21 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From dab5dff370ac5a1aeddd0522dfbc23ddae1b8f52 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 12 Sep 2022 17:26:28 +0300 Subject: [PATCH 168/214] Disable prefetch by default to make it possible to early merge this PR to main --- src/backend/utils/misc/guc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4c399b58b0a..fc282dcda24 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -964,7 +964,8 @@ static struct config_bool ConfigureNamesBool[] = GUC_EXPLAIN }, &enable_seqscan_prefetch, - true, + false /* temporary disable to be able to merge in main */ + /* true, */ NULL, NULL, NULL }, { @@ -2162,7 +2163,8 @@ static struct config_int ConfigureNamesInt[] = GUC_EXPLAIN }, &seqscan_prefetch_buffers, - 8, 0, 1000, + /* 8, 0, 1000, */ + 0, 0, 1000, /* temporary disable to be able to merge in main */ NULL, NULL, NULL }, { From 62cbaef441bd98d9971f39cf10b0e773ea16dabd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 12 Sep 2022 17:35:56 +0300 Subject: [PATCH 169/214] Disable prefetch by default to make it possible to early merge this PR to main --- src/backend/utils/misc/guc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fc282dcda24..6f1bd459f0d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -964,7 +964,7 @@ static struct config_bool ConfigureNamesBool[] = GUC_EXPLAIN }, &enable_seqscan_prefetch, - false /* temporary disable to be able to merge in main */ + false, /* temporary disable to be able to merge in main */ /* true, */ NULL, NULL, NULL }, From b61bf54b62c7dbab6b9fb4b0b2872c9f7e8d2666 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 12 Sep 2022 21:44:48 +0300 Subject: [PATCH 170/214] Update expected output for sysviews test because of changed default value of enable_seqscan_prefetch --- src/test/regress/expected/sysviews.out | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 4453fcaa517..89aebb5e10b 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -120,7 +120,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on - enable_seqscan_prefetch | on + enable_seqscan_prefetch | off enable_sort | on enable_tidscan | on (21 rows) From 96f340ed2cfdacef5298859cddaa82ab249281dc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 16 Sep 2022 11:43:50 +0300 Subject: [PATCH 171/214] Set last written page lsn for created relation (#209) --- src/backend/catalog/storage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index d40183aff74..0eb6c00ee47 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -175,6 +175,7 @@ void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. @@ -184,7 +185,8 @@ log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenLSNForRelation(lsn, *rnode, forkNum); } /* From 68d58596dc20e5be20b29f568518dc1321352e8b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 20 Sep 2022 10:48:58 +0300 Subject: [PATCH 172/214] Undo disarming VM check warning in vacuumlazy.c (#213) --- src/backend/access/heap/vacuumlazy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index cc762d80dad..99c6a3298c1 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1496,10 +1496,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) else if (all_visible_according_to_vm && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { - /* ZENITH-XXX: all visible hint is not wal-logged - * FIXME: Replay visibilitymap changes in pageserver - */ - elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", vacrel->relname, blkno); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); From e5462747af01c9d7a7ce85c483f86cdd75c8ed27 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Oct 2022 20:01:14 +0300 Subject: [PATCH 173/214] Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#221) * Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged refer #2587 * Undo Neon trick in heap_xlog_insert which is not needed any more after pinning page for speculative insert * Update src/backend/access/heap/heapam.c Co-authored-by: Heikki Linnakangas * Move ReleaseBuffer to the end of heap_finish_speculative function * Update src/backend/access/heap/heapam.c Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- src/backend/access/heap/heapam.c | 26 ++++++++++++++++++++++++-- 1 file changed,
24 insertions(+), 2 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index be16700323b..249107cef4a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2258,7 +2258,18 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -5930,6 +5941,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) END_CRIT_SECTION(); + ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */ UnlockReleaseBuffer(buffer); } @@ -6002,6 +6014,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + /* + * NEON: release buffer pinned by heap_insert + * + * This function is also used on the toast tuples of an aborted speculative + * insertion. For those, there is no token on the tuple, and we didn' t keep + * the pin. + */ + if (HeapTupleHeaderIsSpeculative(tp.t_data)) + ReleaseBuffer(buffer); + /* * No need to check for serializable conflicts here. There is never a * need for a combo CID, either. No need to extract replica identity, or @@ -9003,7 +9025,7 @@ heap_xlog_insert(XLogReaderState *record) XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, (xlrec->flags & XLH_INSERT_IS_SPECULATIVE) ? SpecTokenOffsetNumber : xlrec->offnum); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is From edd2e65dcb227ac732e7d46d4674a79910f330a0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 24 Oct 2022 12:13:46 +0300 Subject: [PATCH 174/214] Fix shared memory initialization for last written LSN cache (#224) * Fix shared memory initialization for last written LSN cache Replace (from,till) with (from,n_blocks) for SetLastWrittenLSNForBlockRange function * Fast exit from SetLastWrittenLSNForBlockRange for n_blocks == 0 --- src/backend/access/transam/xlog.c | 17 ++++++++++------- src/include/access/xlog.h | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 68b252da0bc..f4ed08eb66a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5255,7 +5255,7 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) - ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); { static HASHCTL info; @@ -9001,9 +9001,9 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. 
*/ void -SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till) +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn == InvalidXLogRecPtr) + if (lsn == InvalidXLogRecPtr || n_blocks == 0) return; LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -9018,12 +9018,15 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for BufferTag key; bool found; BlockNumber bucket; + BlockNumber start_bucket; /* inclusive */ + BlockNumber end_bucket; /* exclusive */ + + start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + end_bucket = (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; key.rnode = rnode; key.forkNum = forknum; - for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket++) + for (bucket = start_bucket; bucket < end_bucket; bucket++) { key.blockNum = bucket; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); @@ -9061,7 +9064,7 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { - SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, blkno); + SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, 1); } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 182f6c6449f..b784cbf0493 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -358,7 +358,7 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); -extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber till); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum); extern XLogRecPtr GetLastWrittenLSN(RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); From a270df9b342fa11910ba878cb732a7c3d79968b8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 28 Oct 2022 10:00:05 +0300 Subject: [PATCH 175/214] Fix upper boundary caculation in the chunks loop in SetLastWrittenLSNForBlockRange (#230) --- src/backend/access/transam/xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f4ed08eb66a..144bf0e3432 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9022,7 +9022,8 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for BlockNumber end_bucket; /* exclusive */ start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - end_bucket = (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; + end_bucket = from == REL_METADATA_PSEUDO_BLOCKNO + ? 
start_bucket + 1 : (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; key.rnode = rnode; key.forkNum = forknum; From 9c10b7a9d6c23b2e62019b3ea26ee348506e22a3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 Oct 2022 01:11:43 +0100 Subject: [PATCH 176/214] Move walredo process code under pgxn in the main 'neon' repository. - Refactor the way the WalProposerMain function is called when started with --sync-safekeepers. The postgres binary now explicitly loads the 'neon.so' library and calls the WalProposerMain in it. This is simpler than the global function callback "hook" we previously used. - Move the WAL redo process code to a new library, neon_walredo.so, and use the same mechanism as for --sync-safekeepers to call the WalRedoMain function, when launched with --walredo argument. - Also move the seccomp code to neon_walredo.so library. I kept the configure check in the postgres side for now, though. --- src/backend/main/main.c | 43 +- src/backend/postmaster/Makefile | 5 - src/backend/postmaster/bgworker.c | 1 - src/backend/postmaster/postmaster.c | 1 - src/backend/postmaster/seccomp.c | 249 -------- src/backend/replication/Makefile | 3 +- src/backend/replication/walpropcompat.c | 96 --- src/backend/replication/walsender.c | 1 - src/backend/storage/buffer/buf_init.c | 8 + src/backend/tcop/Makefile | 2 - src/backend/tcop/zenith_wal_redo.c | 816 ------------------------ src/backend/utils/misc/guc.c | 1 - src/include/miscadmin.h | 2 +- src/include/postmaster/seccomp.h | 26 - src/include/replication/walpropshim.h | 19 - src/include/storage/buf_internals.h | 2 + src/include/tcop/tcopprot.h | 4 - 17 files changed, 50 insertions(+), 1229 deletions(-) delete mode 100644 src/backend/postmaster/seccomp.c delete mode 100644 src/backend/replication/walpropcompat.c delete mode 100644 src/backend/tcop/zenith_wal_redo.c delete mode 100644 src/include/postmaster/seccomp.h delete mode 100644 src/include/replication/walpropshim.h diff --git a/src/backend/main/main.c b/src/backend/main/main.c index b63d7bfb1c7..1d93111bd03 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -33,9 +33,9 @@ #include "bootstrap/bootstrap.h" #include "common/username.h" +#include "miscadmin.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "replication/walpropshim.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" @@ -52,6 +52,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. 
+ */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. @@ -207,11 +242,9 @@ main(int argc, char *argv[]) NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) - WalRedoMain(argc, argv, - NULL, /* no dbname */ - strdup(get_user_name_or_exit(progname))); /* does not return */ + CallExtMain("neon_walredo", "WalRedoMain", argc, argv); else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) - WalProposerSync(argc, argv); + CallExtMain("neon", "WalProposerSync", argc, argv); else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 926ee077111..bfdf6a833db 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -26,9 +26,4 @@ OBJS = \ syslogger.o \ walwriter.o -ifeq ($(with_libseccomp),yes) -OBJS += \ - seccomp.o -endif - include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 540a8454da2..c40410d73ea 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,7 +22,6 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" -#include "replication/walpropshim.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 855b1de5de7..0a4533251ba 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -117,7 +117,6 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" -#include "replication/walpropshim.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c deleted file mode 100644 index 3ac21b02983..00000000000 --- a/src/backend/postmaster/seccomp.c +++ /dev/null @@ -1,249 +0,0 @@ -/*------------------------------------------------------------------------- - * - * seccomp.c - * Secure Computing BPF API wrapper. - * - * Pageserver delegates complex WAL decoding duties to postgres, - * which means that the latter might fall victim to carefully designed - * malicious WAL records and start doing harmful things to the system. - * To prevent this, it has been decided to limit possible interactions - * with the outside world using the Secure Computing BPF mode. - * - * We use this mode to disable all syscalls not in the allowlist. This - * approach has its pros & cons: - * - * - We have to carefully handpick and maintain the set of syscalls - * required for the WAL redo process. Core dumps help with that. - * The method of trial and error seems to work reasonably well, - * but it would be nice to find a proper way to "prove" that - * the set in question is both necessary and sufficient. - * - * - Once we enter the seccomp bpf mode, it's impossible to lift those - * restrictions (otherwise, what kind of "protection" would that be?). 
- * Thus, we have to either enable extra syscalls for the clean shutdown, - * or exit the process immediately via _exit() instead of proc_exit(). - * - * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom - * facility to deal with the forbidden syscalls? If we'd like to embed - * a startup security test, we should go with the latter; In that - * case, which one of the following options is preferable? - * - * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. - * Provide a common signal handler with a static switch to override - * its behavior for the test case. This would undermine the whole - * purpose of such protection, so we'd have to go further and remap - * the memory backing the switch as readonly, then ban mprotect(). - * Ugly and fragile, to say the least. - * - * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. - * Provide 2 different signal handlers: one for a test case, - * another for the main processing loop. Install the first one, - * enable seccomp, perform the test, switch to the second one, - * finally ban sigaction(), presto! - * - * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the - * test, then ban it altogether with another filter. The downside - * of this solution is that we don't actually check that - * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. - * - * Either approach seems to require two eBPF filter programs, - * which is unfortunate: the man page tells this is uncommon. - * Maybe I (@funbringer) am missing something, though; I encourage - * any reader to get familiar with it and scrutinize my conclusions. - * - * TODOs and ideas in no particular order: - * - * - Do something about mmap() in musl's malloc(). - * Definitely not a priority if we don't care about musl. - * - * - See if we can untangle PG's shutdown sequence (involving unlink()): - * - * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. - * * Investigate chroot() or mount namespaces for better FS isolation. - * * (Per Heikki) Simply call _exit(), no big deal. - * * Come up with a better idea? - * - * - Make use of seccomp's argument inspection (for what?). - * Unfortunately, it views all syscall arguments as scalars, - * so it won't work for e.g. string comparison in unlink(). - * - * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). - * - * - Test against various linux distros & glibc versions. - * I suspect that certain libc functions might involve slightly - * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. - * - * - Test on any arch other than amd64 to see if it works there. - * - * - * IDENTIFICATION - * src/backend/postmaster/seccomp.c - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" -#include "miscadmin.h" -#include "postmaster/seccomp.h" - -#include -#include - -static void die(int code, const char *str); - -static bool seccomp_test_sighandler_done = false; -static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); -static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); - -static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); - -void seccomp_load_rules(PgSeccompRule *rules, int count) -{ - struct sigaction action = { .sa_flags = SA_SIGINFO }; - PgSeccompRule rule; - long fd; - - /* - * Install a test signal handler. - * XXX: pqsignal() is too restrictive for our purposes, - * since we'd like to examine the contents of siginfo_t. 
- */ - action.sa_sigaction = seccomp_test_sighandler; - if (sigaction(SIGSYS, &action, NULL) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not install test SIGSYS handler"))); - - /* - * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). - */ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (seccomp_test_sighandler_done) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: signal handler test flag was set unexpectedly"))); - if (fd < 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); - close((int) fd); - - /* Set a trap on open() to test seccomp bpf */ - rule = PG_SCMP(open, SCMP_ACT_TRAP); - if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not load test trap"))); - - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (!seccomp_test_sighandler_done) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: SIGSYS handler doesn't seem to work"))); - - /* Now that everything seems to work, install a proper handler */ - action.sa_sigaction = seccomp_deny_sighandler; - if (sigaction(SIGSYS, &action, NULL) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not install SIGSYS handler"))); - - /* If this succeeds, any syscall not in the list will crash the process */ - if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not enter seccomp mode"))); -} - -/* - * Enter seccomp mode with a BPF filter that will only allow - * certain syscalls to proceed. - */ -static int -do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) -{ - scmp_filter_ctx ctx; - int rc = -1; - - /* Create a context with a default action for syscalls not in the list */ - if ((ctx = seccomp_init(def_action)) == NULL) - goto cleanup; - - for (int i = 0; i < count; i++) - { - PgSeccompRule *rule = &rules[i]; - if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) - goto cleanup; - } - - /* Try building & loading the program into the kernel */ - if ((rc = seccomp_load(ctx)) != 0) - goto cleanup; - -cleanup: - /* - * We don't need the context anymore regardless of the result, - * since either we failed or the eBPF program has already been - * loaded into the linux kernel. 
- */ - seccomp_release(ctx); - return rc; -} - -static void -die(int code, const char *str) -{ - /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ - ssize_t _unused pg_attribute_unused(); - /* Best effort write to stderr */ - _unused = write(fileno(stderr), str, strlen(str)); - - /* XXX: we don't want to run any atexit callbacks */ - _exit(code); -} - -static void -seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) -{ -#define DIE_PREFIX "seccomp test signal handler: " - - /* Check that this signal handler is used only for a single test case */ - if (seccomp_test_sighandler_done) - die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); - seccomp_test_sighandler_done = true; - - if (signum != SIGSYS) - die(1, DIE_PREFIX "bad signal number\n"); - - /* TODO: maybe somehow extract the hardcoded syscall number */ - if (info->si_syscall != SCMP_SYS(open)) - die(1, DIE_PREFIX "bad syscall number\n"); - -#undef DIE_PREFIX -} - -static void -seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) -{ - /* - * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() - * to resolve the syscall's name, since it calls strdup() - * under the hood (wtf!). - */ - char buffer[128]; - (void)snprintf(buffer, lengthof(buffer), - "---------------------------------------\n" - "seccomp: bad syscall %d\n" - "---------------------------------------\n", - info->si_syscall); - - /* - * Instead of silently crashing the process with - * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, - * we'd like to receive a real SIGSYS to print the - * message and *then* immediately exit. - */ - die(1, buffer); -} diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 4b7c3d32a4d..a0381e52f31 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -24,8 +24,7 @@ OBJS = \ syncrep_gram.o \ walreceiver.o \ walreceiverfuncs.o \ - walsender.o \ - walpropcompat.o + walsender.o SUBDIRS = logical diff --git a/src/backend/replication/walpropcompat.c b/src/backend/replication/walpropcompat.c deleted file mode 100644 index 8caf2460795..00000000000 --- a/src/backend/replication/walpropcompat.c +++ /dev/null @@ -1,96 +0,0 @@ -#include "postgres.h" - -#include -#include -#include - -#include "access/xlog.h" -#include "access/xlog_internal.h" -#include "access/xlogdefs.h" -#include "miscadmin.h" -#include "postmaster/bgworker.h" -#include "postmaster/postmaster.h" -#include "storage/fd.h" -#include "utils/guc.h" -#include "replication/walpropshim.h" - -bool syncSafekeepers = false; -void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId) = NULL; -void (*WalProposerStart) (void) = NULL; - -/* - * Entry point for `postgres --sync-safekeepers`. - */ -void -WalProposerSync(int argc, char *argv[]) -{ - struct stat stat_buf; - - syncSafekeepers = true; - ThisTimeLineID = 1; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. 
- * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - ChangeToDataDir(); - - /* Create pg_wal directory, if it doesn't exist */ - if (stat(XLOGDIR, &stat_buf) != 0) - { - ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); - if (MakePGDirectory(XLOGDIR) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - XLOGDIR))); - exit(1); - } - } - - load_file("neon", false); - - if (NULL == WalProposerInit) - elog(ERROR, "Neon failed to register WalProposerInit"); - - if (NULL == WalProposerStart) - elog(ERROR, "Neon failed to register WalProposerStart"); - - WalProposerInit(0, 0); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index ce16a78a61c..23572609144 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -73,7 +73,6 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" -#include "replication/walpropshim.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index a299be10430..c601970cd97 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -23,6 +23,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/tcop/Makefile b/src/backend/tcop/Makefile index 84f027436a4..f662a7dd1cf 100644 --- a/src/backend/tcop/Makefile +++ b/src/backend/tcop/Makefile @@ -20,6 +20,4 @@ OBJS = \ pquery.o \ utility.o -OBJS += zenith_wal_redo.o - include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c deleted file mode 100644 index 67653170bac..00000000000 --- a/src/backend/tcop/zenith_wal_redo.c +++ /dev/null @@ -1,816 +0,0 @@ -/*------------------------------------------------------------------------- - * - * zenith_wal_redo.c - * Entry point for WAL redo helper - * - * - * This file contains an alternative main() function for the 'postgres' - * binary. In the special mode, we go into a special mode that's similar - * to the single user mode. We don't launch postmaster or any auxiliary - * processes. Instead, we wait for command from 'stdin', and respond to - * 'stdout'. - * - * The protocol through stdin/stdout is loosely based on the libpq protocol. 
- * The process accepts messages through stdin, and each message has the format: - * - * char msgtype; - * int32 length; // length of message including 'length' but excluding - * // 'msgtype', in network byte order - * - * - * There are three message types: - * - * BeginRedoForBlock ('B'): Prepare for WAL replay for given block - * PushPage ('P'): Copy a page image (in the payload) to buffer cache - * ApplyRecord ('A'): Apply a WAL record (in the payload) - * GetPage ('G'): Return a page image from buffer cache. - * - * Currently, you only get a response to GetPage requests; the response is - * simply a 8k page, without any headers. Errors are logged to stderr. - * - * FIXME: - * - this currently requires a valid PGDATA, and creates a lock file there - * like a normal postmaster. There's no fundamental reason for that, though. - * - should have EndRedoForBlock, and flush page cache, to allow using this - * mechanism for more than one block without restarting the process. - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/tcop/zenith_wal_redo.c - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_SELECT_H -#include -#endif -#ifdef HAVE_SYS_RESOURCE_H -#include -#include -#endif - -#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) -#define MALLOC_NO_MMAP -#include -#endif - -#ifndef HAVE_GETRUSAGE -#include "rusagestub.h" -#endif - -#include "access/xlog.h" -#include "access/xlog_internal.h" -#include "access/xlogutils.h" -#include "catalog/pg_class.h" -#include "libpq/libpq.h" -#include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/postmaster.h" -#include "postmaster/seccomp.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "storage/ipc.h" -#include "storage/proc.h" -#include "storage/smgr.h" -#include "tcop/tcopprot.h" -#include "utils/memutils.h" -#include "utils/ps_status.h" - -static int ReadRedoCommand(StringInfo inBuf); -static void BeginRedoForBlock(StringInfo input_message); -static void PushPage(StringInfo input_message); -static void ApplyRecord(StringInfo input_message); -static void apply_error_callback(void *arg); -static bool redo_block_filter(XLogReaderState *record, uint8 block_id); -static void GetPage(StringInfo input_message); -static ssize_t buffered_read(void *buf, size_t count); - -static BufferTag target_redo_tag; - -/* - * Buffer with target WAL redo page. - * We must not evict this page from the buffer pool, but we cannot just keep it pinned because - * some WAL redo functions expect the page to not be pinned. So we have a special check in - * localbuf.c to prevent this buffer from being evicted. - */ -Buffer wal_redo_buffer; -bool am_wal_redo_postgres; - -static XLogReaderState *reader_state; - -#define TRACE DEBUG5 - -#ifdef HAVE_LIBSECCOMP -static void -enter_seccomp_mode(void) -{ - PgSeccompRule syscalls[] = - { - /* Hard requirements */ - PG_SCMP_ALLOW(exit_group), - PG_SCMP_ALLOW(pselect6), - PG_SCMP_ALLOW(read), - PG_SCMP_ALLOW(select), - PG_SCMP_ALLOW(write), - - /* Memory allocation */ - PG_SCMP_ALLOW(brk), -#ifndef MALLOC_NO_MMAP - /* TODO: musl doesn't have mallopt */ - PG_SCMP_ALLOW(mmap), - PG_SCMP_ALLOW(munmap), -#endif - /* - * getpid() is called on assertion failure, in ExceptionalCondition. 
- * It's not really needed, but seems pointless to hide it either. The - * system call unlikely to expose a kernel vulnerability, and the PID - * is stored in MyProcPid anyway. - */ - PG_SCMP_ALLOW(getpid), - - /* Enable those for a proper shutdown. - PG_SCMP_ALLOW(munmap), - PG_SCMP_ALLOW(shmctl), - PG_SCMP_ALLOW(shmdt), - PG_SCMP_ALLOW(unlink), // shm_unlink - */ - }; - -#ifdef MALLOC_NO_MMAP - /* Ask glibc not to use mmap() */ - mallopt(M_MMAP_MAX, 0); -#endif - - seccomp_load_rules(syscalls, lengthof(syscalls)); -} -#endif - -/* ---------------------------------------------------------------- - * FIXME comment - * PostgresMain - * postgres main loop -- all backends, interactive or otherwise start here - * - * argc/argv are the command line arguments to be used. (When being forked - * by the postmaster, these are not the original argv array of the process.) - * dbname is the name of the database to connect to, or NULL if the database - * name should be extracted from the command line arguments or defaulted. - * username is the PostgreSQL user name to be used for the session. - * ---------------------------------------------------------------- - */ -void -WalRedoMain(int argc, char *argv[], - const char *dbname, - const char *username) -{ - int firstchar; - StringInfoData input_message; -#ifdef HAVE_LIBSECCOMP - bool enable_seccomp; -#endif - - /* Initialize startup process environment if necessary. */ - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - am_wal_redo_postgres = true; - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* - * WAL redo does not need a large number of buffers. And speed of - * DropRelFileNodeAllLocalBuffers() is proportional to the number of - * buffers. So let's keep it small (default value is 1024) - */ - num_temp_buffers = 4; - - /* - * Parse command-line options. - * TODO - */ - //process_postgres_switches(argc, argv, PGC_POSTMASTER, &dbname); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - proc_exit(1); - - /* - * Set up signal handlers. (InitPostmasterChild or InitStandaloneProcess - * has already set up BlockSig and made that the active signal mask.) - * - * Note that postmaster blocked all signals before forking child process, - * so there is no race condition whereby we might receive a signal before - * we have set up the handler. - * - * Also note: it's best not to use any signals that are SIG_IGNored in the - * postmaster. If such a signal arrives before we are able to change the - * handler to non-SIG_IGN, it'll get dropped. Instead, make a dummy - * handler in the postmaster to reserve the signal. (Of course, this isn't - * an issue for signals that are locally generated, such as SIGALRM and - * SIGPIPE.) - */ -#if 0 - if (am_walsender) - WalSndSignals(); - else - { - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, StatementCancelHandler); /* cancel current query */ - pqsignal(SIGTERM, die); /* cancel current query and exit */ - - /* - * In a postmaster child backend, replace SignalHandlerForCrashExit - * with quickdie, so we can tell the client we're dying. - * - * In a standalone backend, SIGQUIT can be generated from the keyboard - * easily, while SIGTERM cannot, so we make both signals do die() - * rather than quickdie(). 
- */ - if (IsUnderPostmaster) - pqsignal(SIGQUIT, quickdie); /* hard crash time */ - else - pqsignal(SIGQUIT, die); /* cancel current query and exit */ - InitializeTimeouts(); /* establishes SIGALRM handler */ - - /* - * Ignore failure to write to frontend. Note: if frontend closes - * connection, we will notice it and exit cleanly when control next - * returns to outer loop. This seems safer than forcing exit in the - * midst of output during who-knows-what operation... - */ - pqsignal(SIGPIPE, SIG_IGN); - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGUSR2, SIG_IGN); - pqsignal(SIGFPE, FloatExceptionHandler); - - /* - * Reset some signals that are accepted by postmaster but not by - * backend - */ - pqsignal(SIGCHLD, SIG_DFL); /* system() requires this on some - * platforms */ - } -#endif - - /* - * Validate we have been given a reasonable-looking DataDir and change into it. - */ - checkDataDir(); - ChangeToDataDir(); - - /* - * Create lockfile for data directory. - */ - CreateDataDirLockFile(false); - - /* read control file (error checking and contains config ) */ - LocalProcessControlFile(false); - - process_shared_preload_libraries(); - - /* Initialize MaxBackends (if under postmaster, was done already) */ - InitializeMaxBackends(); - - /* Early initialization */ - BaseInit(); - - /* - * Create a per-backend PGPROC struct in shared memory. We must do - * this before we can use LWLocks. - */ - InitAuxiliaryProcess(); - - SetProcessingMode(NormalProcessing); - - /* Redo routines won't work if we're not "in recovery" */ - InRecovery = true; - - /* - * Create the memory context we will use in the main loop. - * - * MessageContext is reset once per iteration of the main loop, ie, upon - * completion of processing of each command message from the client. - */ - MessageContext = AllocSetContextCreate(TopMemoryContext, - "MessageContext", - ALLOCSET_DEFAULT_SIZES); - - /* we need a ResourceOwner to hold buffer pins */ - Assert(CurrentResourceOwner == NULL); - CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); - - /* Initialize resource managers */ - for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) - { - if (RmgrTable[rmid].rm_startup != NULL) - RmgrTable[rmid].rm_startup(); - } - reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); - -#ifdef HAVE_LIBSECCOMP - /* We prefer opt-out to opt-in for greater security */ - enable_seccomp = true; - for (int i = 1; i < argc; i++) - if (strcmp(argv[i], "--disable-seccomp") == 0) - enable_seccomp = false; - - /* - * We deliberately delay the transition to the seccomp mode - * until it's time to enter the main processing loop; - * else we'd have to add a lot more syscalls to the allowlist. - */ - if (enable_seccomp) - enter_seccomp_mode(); -#endif - - /* - * Main processing loop - */ - MemoryContextSwitchTo(MessageContext); - initStringInfo(&input_message); - - for (;;) - { - /* Release memory left over from prior query cycle. */ - resetStringInfo(&input_message); - - set_ps_display("idle"); - - /* - * (3) read a command (loop blocks here) - */ - firstchar = ReadRedoCommand(&input_message); - switch (firstchar) - { - case 'B': /* BeginRedoForBlock */ - BeginRedoForBlock(&input_message); - break; - - case 'P': /* PushPage */ - PushPage(&input_message); - break; - - case 'A': /* ApplyRecord */ - ApplyRecord(&input_message); - break; - - case 'G': /* GetPage */ - GetPage(&input_message); - break; - - /* - * EOF means we're done. Perform normal shutdown. 
- */ - case EOF: - ereport(LOG, - (errmsg("received EOF on stdin, shutting down"))); - -#ifdef HAVE_LIBSECCOMP - /* - * Skip the shutdown sequence, leaving some garbage behind. - * Hopefully, postgres will clean it up in the next run. - * This way we don't have to enable extra syscalls, which is nice. - * See enter_seccomp_mode() above. - */ - if (enable_seccomp) - _exit(0); -#endif - /* - * NOTE: if you are tempted to add more code here, DON'T! - * Whatever you had in mind to do should be set up as an - * on_proc_exit or on_shmem_exit callback, instead. Otherwise - * it will fail to be called during other backend-shutdown - * scenarios. - */ - proc_exit(0); - - default: - ereport(FATAL, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid frontend message type %d", - firstchar))); - } - } /* end of input-reading loop */ -} - -/* - * Some debug function that may be handy for now. - */ -pg_attribute_unused() -static char * -pprint_buffer(char *data, int len) -{ - StringInfoData s; - initStringInfo(&s); - appendStringInfo(&s, "\n"); - for (int i = 0; i < len; i++) { - - appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); - if (i % 32 == 31) { - appendStringInfo(&s, "\n"); - } - } - appendStringInfo(&s, "\n"); - - return s.data; -} - -/* ---------------------------------------------------------------- - * routines to obtain user input - * ---------------------------------------------------------------- - */ - -/* - * Read next command from the client. - * - * the string entered by the user is placed in its parameter inBuf, - * and we act like a Q message was received. - * - * EOF is returned if end-of-file input is seen; time to shut down. - * ---------------- - */ -static int -ReadRedoCommand(StringInfo inBuf) -{ - ssize_t ret; - char hdr[1 + sizeof(int32)]; - int qtype; - int32 len; - - /* Read message type and message length */ - ret = buffered_read(hdr, sizeof(hdr)); - if (ret != sizeof(hdr)) - { - if (ret == 0) - return EOF; - else if (ret < 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message header: %m"))); - else - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected EOF"))); - } - - qtype = hdr[0]; - memcpy(&len, &hdr[1], sizeof(int32)); - len = pg_ntoh32(len); - - if (len < 4) - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid message length"))); - - len -= 4; /* discount length itself */ - - /* Read the message payload */ - enlargeStringInfo(inBuf, len); - ret = buffered_read(inBuf->data, len); - if (ret != len) - { - if (ret < 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message: %m"))); - else - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected EOF"))); - } - inBuf->len = len; - inBuf->data[len] = '\0'; - - return qtype; -} - -/* - * Prepare for WAL replay on given block - */ -static void -BeginRedoForBlock(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - SMgrRelation reln; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - wal_redo_buffer = InvalidBuffer; - - INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); - - elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", - 
target_redo_tag.rnode.spcNode, - target_redo_tag.rnode.dbNode, - target_redo_tag.rnode.relNode, - target_redo_tag.forkNum, - target_redo_tag.blockNum); - - reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); - if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || - reln->smgr_cached_nblocks[forknum] < blknum + 1) - { - reln->smgr_cached_nblocks[forknum] = blknum + 1; - } -} - -/* - * Receive a page given by the client, and put it into buffer cache. - */ -static void -PushPage(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - const char *content; - Buffer buf; - Page page; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - * 8k page content - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - content = pq_getmsgbytes(input_message, BLCKSZ); - - buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL); - wal_redo_buffer = buf; - page = BufferGetPage(buf); - memcpy(page, content, BLCKSZ); - MarkBufferDirty(buf); /* pro forma */ - UnlockReleaseBuffer(buf); -} - -/* - * Receive a WAL record, and apply it. - * - * All the pages should be loaded into the buffer cache by PushPage calls already. - */ -static void -ApplyRecord(StringInfo input_message) -{ - char *errormsg; - XLogRecPtr lsn; - XLogRecord *record; - int nleft; - ErrorContextCallback errcallback; - - /* - * message format: - * - * LSN (the *end* of the record) - * record - */ - lsn = pq_getmsgint64(input_message); - - smgrinit(); /* reset inmem smgr state */ - - /* note: the input must be aligned here */ - record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); - - nleft = input_message->len - input_message->cursor; - if (record->xl_tot_len != sizeof(XLogRecord) + nleft) - elog(ERROR, "mismatch between record (%d) and message size (%d)", - record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); - - /* Setup error traceback support for ereport() */ - errcallback.callback = apply_error_callback; - errcallback.arg = (void *) reader_state; - errcallback.previous = error_context_stack; - error_context_stack = &errcallback; - - XLogBeginRead(reader_state, lsn); - /* - * In lieu of calling XLogReadRecord, store the record 'decoded_record' - * buffer directly. - */ - reader_state->ReadRecPtr = lsn; - reader_state->decoded_record = record; - if (!DecodeXLogRecord(reader_state, record, &errormsg)) - elog(ERROR, "failed to decode WAL record: %s", errormsg); - - /* Ignore any other blocks than the ones the caller is interested in */ - redo_read_buffer_filter = redo_block_filter; - - RmgrTable[record->xl_rmid].rm_redo(reader_state); - /* - * If no base image of the page was provided by PushPage, initialize wal_redo_buffer here. - * The first WAL record must initialize the page in that case. 
- */ - if (BufferIsInvalid(wal_redo_buffer)) - { - wal_redo_buffer = ReadBufferWithoutRelcache(target_redo_tag.rnode, target_redo_tag.forkNum, target_redo_tag.blockNum, RBM_NORMAL, NULL); - Assert(!BufferIsInvalid(wal_redo_buffer)); - ReleaseBuffer(wal_redo_buffer); - } - redo_read_buffer_filter = NULL; - - /* Pop the error context stack */ - error_context_stack = errcallback.previous; - - elog(TRACE, "applied WAL record with LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn); -} - -/* - * Error context callback for errors occurring during ApplyRecord - */ -static void -apply_error_callback(void *arg) -{ - XLogReaderState *record = (XLogReaderState *) arg; - StringInfoData buf; - - initStringInfo(&buf); - xlog_outdesc(&buf, record); - - /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", - LSN_FORMAT_ARGS(record->ReadRecPtr), - buf.data); - - pfree(buf.data); -} - -static bool -redo_block_filter(XLogReaderState *record, uint8 block_id) -{ - BufferTag target_tag; - - if (!XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) - { - /* Caller specified a bogus block_id */ - elog(PANIC, "failed to locate backup block with ID %d", block_id); - } - - /* - * Can a WAL redo function ever access a relation other than the one that - * it modifies? I don't see why it would. - */ - if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) - elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", - target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); - - /* - * If this block isn't one we are currently restoring, then return 'true' - * so that this gets ignored - */ - return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); -} - -/* - * Get a page image back from buffer cache. - * - * After applying some records. - */ -static void -GetPage(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - Buffer buf; - Page page; - int tot_written; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - - /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ - - buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL); - Assert(buf == wal_redo_buffer); - page = BufferGetPage(buf); - /* single thread, so don't bother locking the page */ - - /* Response: Page content */ - tot_written = 0; - do { - ssize_t rc; - - rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); - if (rc < 0) { - /* If interrupted by signal, just retry */ - if (errno == EINTR) - continue; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to stdout: %m"))); - } - tot_written += rc; - } while (tot_written < BLCKSZ); - - ReleaseBuffer(buf); - DropRelFileNodeAllLocalBuffers(rnode); - wal_redo_buffer = InvalidBuffer; - - elog(TRACE, "Page sent back for block %u", blknum); -} - - -/* Buffer used by buffered_read() */ -static char stdin_buf[16 * 1024]; -static size_t stdin_len = 0; /* # of bytes in buffer */ -static size_t stdin_ptr = 0; /* # of bytes already consumed */ - -/* - * Like read() on stdin, but buffered. 
- * - * We cannot use libc's buffered fread(), because it uses syscalls that we - * have disabled with seccomp(). Depending on the platform, it can call - * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' - * seems problematic because it allows interrogating files by path name. - * - * The return value is the number of bytes read. On error, -1 is returned, and - * errno is set appropriately. Unlike read(), this fills the buffer completely - * unless an error happens or EOF is reached. - */ -static ssize_t -buffered_read(void *buf, size_t count) -{ - char *dst = buf; - - while (count > 0) - { - size_t nthis; - - if (stdin_ptr == stdin_len) - { - ssize_t ret; - - ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); - if (ret < 0) - { - /* don't do anything here that could set 'errno' */ - return ret; - } - if (ret == 0) - { - /* EOF */ - break; - } - stdin_len = (size_t) ret; - stdin_ptr = 0; - } - nthis = Min(stdin_len - stdin_ptr, count); - - memcpy(dst, &stdin_buf[stdin_ptr], nthis); - - stdin_ptr += nthis; - count -= nthis; - dst += nthis; - } - - return (dst - (char *) buf); -} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 6f1bd459f0d..9dc37cf5b23 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -80,7 +80,6 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walpropshim.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index bfbfe5da8c9..fd5e542d95c 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -493,7 +493,7 @@ extern void CancelBackup(void); extern size_t get_hash_memory_limit(void); extern int get_hash_mem(void); -/* in src/backend/tcop/zenith_wal_redo.c */ +/* in storage/buffer/buf_init.c */ extern bool am_wal_redo_postgres; #endif /* MISCADMIN_H */ diff --git a/src/include/postmaster/seccomp.h b/src/include/postmaster/seccomp.h deleted file mode 100644 index 1613d34bd47..00000000000 --- a/src/include/postmaster/seccomp.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef PG_SECCOMP_H -#define PG_SECCOMP_H - -#include "postgres.h" - -#ifdef HAVE_LIBSECCOMP -#include -#endif - -typedef struct { - int psr_syscall; /* syscall number */ - uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ -} PgSeccompRule; - -#define PG_SCMP(syscall, action) \ - (PgSeccompRule) { \ - .psr_syscall = SCMP_SYS(syscall), \ - .psr_action = (action), \ - } - -#define PG_SCMP_ALLOW(syscall) \ - PG_SCMP(syscall, SCMP_ACT_ALLOW) - -void seccomp_load_rules(PgSeccompRule *syscalls, int count); - -#endif /* PG_SECCOMP_H */ diff --git a/src/include/replication/walpropshim.h b/src/include/replication/walpropshim.h deleted file mode 100644 index 07757580cc9..00000000000 --- a/src/include/replication/walpropshim.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * walpropshim.h - * various hooks for the walproposer component of the Neon extension. - */ - -#ifndef __WALPROPOSER_H__ -#define __WALPROPOSER_H__ - -/* - * Set to true only in standalone run of `postgres --sync-safekeepers`. 
- * See also the top comment in contrib/neon/walproposer.c - */ -extern PGDLLIMPORT bool syncSafekeepers; -extern PGDLLIMPORT void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId); -extern PGDLLIMPORT void (*WalProposerStart) (void); - -void WalProposerSync(int argc, char *argv[]); - -#endif diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 905b6d1ab50..d1025ee1501 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -278,6 +278,8 @@ typedef struct WritebackContext extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern BufferDesc *LocalBufferDescriptors; diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 9da6e8768ab..968345404e5 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -88,8 +88,4 @@ extern bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source); extern const char *get_stats_option_name(const char *arg); -extern void WalRedoMain(int argc, char *argv[], - const char *dbname, - const char *username); - #endif /* TCOPPROT_H */ From 77d07fdf9c8a9b834f0d77556c028454cb13fd5e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 Oct 2022 14:25:49 +0100 Subject: [PATCH 177/214] Misc cleanup, mostly to reduce unnecessary differences with upstream. Fix indentation, remove unused definitions, resolve some FIXMEs. --- src/backend/access/spgist/spginsert.c | 1 + src/backend/access/transam/xloginsert.c | 3 --- src/backend/storage/buffer/bufmgr.c | 1 + src/backend/storage/page/bufpage.c | 1 + src/backend/utils/misc/guc.c | 2 -- src/include/access/xlogreader.h | 2 +- src/include/replication/walsender.h | 15 ++++++++------- src/include/storage/bufmgr.h | 2 -- 8 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 9773592b703..ee754b5fcb8 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -107,6 +107,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); MarkBufferDirty(nullbuffer); + END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 750cd040a31..a28607682c1 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -122,9 +122,6 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); -/* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ -#define MB ((XLogRecPtr)1024*1024) - /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 27eb4f28ca5..d40741d0fb3 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -56,6 +56,7 @@ #include "utils/timestamp.h" #include "replication/walsender.h" + /* Note: these two macros only work on shared buffers, not local ones! 
*/ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 3616846ad07..82ca91f5977 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,6 +427,7 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); + pfree(tempPage); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9dc37cf5b23..a463d8e2846 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -185,7 +185,6 @@ static int syslog_facility = 0; static void assign_syslog_facility(int newval, void *extra); static void assign_syslog_ident(const char *newval, void *extra); static void assign_session_replication_role(int newval, void *extra); - static bool check_temp_buffers(int *newval, void **extra, GucSource source); static bool check_bonjour(bool *newval, void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); @@ -11857,7 +11856,6 @@ assign_session_replication_role(int newval, void *extra) ResetPlanCache(); } - static bool check_temp_buffers(int *newval, void **extra, GucSource source) { diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index c7fac7bdace..3dd4df7bafa 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -263,7 +263,7 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; - /* Disable validation to allow dumpng corrupt WAL */ + /* Disable validation to allow dumping corrupt WAL */ bool skip_page_validation; bool skip_invalid_records; bool skip_lsn_checks; diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index f902457c26b..c740b938494 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -51,7 +51,8 @@ extern void WalSndRqstFileReload(void); /* * Hook to check for WAL receiving backpressure. 
- * Return value in microseconds */ + * Return value in microseconds + */ extern uint64 (*delay_backend_us)(void); /* expose these so that they can be reused by the neon walproposer extension */ @@ -60,12 +61,12 @@ extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, XLogRecPtr applyPtr, TimestampTz replyTime, bool replyRequested); -void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); -void ProcessStandbyHSFeedback(TimestampTz replyTime, - TransactionId feedbackXmin, - uint32 feedbackEpoch, - TransactionId feedbackCatalogXmin, - uint32 feedbackCatalogEpoch); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); /* * Remember that we want to wakeup walsenders later diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 40fcdf6d871..6d140786c74 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -227,8 +227,6 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); -extern void MarkBufferPermanent(Buffer buffer); - extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); From 220ab8ebad0e0b4179f7851cc6ce2093c3ccf8a6 Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 7 Nov 2022 14:40:39 +0100 Subject: [PATCH 178/214] Optimize prefetch patterns in both heap seqscan and vacuum scans. (#227) Previously, we called PrefetchBuffer [NBlkScanned * seqscan_prefetch_buffers] times in each of those situations, but now only NBlkScanned. In addition, the prefetch mechanism for the vacuum scans is now based on blocks instead of tuples - improving the efficiency. --- src/backend/access/heap/heapam.c | 25 ++++++++++--- src/backend/access/heap/vacuumlazy.c | 56 ++++++++++++++++++++-------- src/backend/storage/smgr/smgr.c | 10 ----- src/include/storage/smgr.h | 1 - 4 files changed, 60 insertions(+), 32 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 249107cef4a..83c554da4f5 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -400,19 +400,32 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) CHECK_FOR_INTERRUPTS(); /* Prefetch next block */ - if (enable_seqscan_prefetch) + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - int prefetch_limit = seqscan_prefetch_buffers; + uint32 prefetch_limit = seqscan_prefetch_buffers; + BlockNumber prefetch_start = page; ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) prefetch_limit = pbscanwork->phsw_chunk_remaining; - if (page + prefetch_limit >= scan->rs_nblocks) - prefetch_limit = scan->rs_nblocks - page - 1; + + /* + * If this is the first page, initiate prefetch of pages page..page + n. + * On each subsequent call, prefetch the next page that we haven't + * prefetched yet, at page + n. 
+ */ + if (scan->rs_startblock != page) + { + prefetch_start = (page + prefetch_limit - 1) % scan->rs_nblocks; + prefetch_limit = 1; + } + else + prefetch_start = page; RelationOpenSmgr(scan->rs_base.rs_rd); - smgr_reset_prefetch(scan->rs_base.rs_rd->rd_smgr); for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, page+i); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, + (prefetch_start+i) % scan->rs_nblocks); } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 99c6a3298c1..df4863672ff 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1218,14 +1218,6 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - if (enable_seqscan_prefetch) - { - int prefetch_limit = Min(nblocks - blkno - 1, seqscan_prefetch_buffers); - RelationOpenSmgr(vacrel->rel); - smgr_reset_prefetch(vacrel->rel->rd_smgr); - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); - } buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); @@ -2323,7 +2315,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int tupindex; + int tupindex, + ptupindex; BlockNumber vacuumed_pages; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; @@ -2346,6 +2339,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuumed_pages = 0; tupindex = 0; + ptupindex = 0; while (tupindex < vacrel->dead_tuples->num_tuples) { BlockNumber tblk; @@ -2356,14 +2350,46 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]); - if (enable_seqscan_prefetch) + + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - int prefetch_limit = Min(vacrel->dead_tuples->num_tuples - tupindex - 1, seqscan_prefetch_buffers); - RelationOpenSmgr(vacrel->rel); - smgr_reset_prefetch(vacrel->rel->rd_smgr); - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex + i])); + /* + * If we're just starting out, prefetch N consecutive blocks. 
+ * If not, only the next 1 block + */ + if (tupindex == 0) + { + int prefetch_limit = Min(vacrel->dead_tuples->num_tuples - tupindex - 1, + Min(vacrel->rel_pages, + seqscan_prefetch_buffers)); + BlockNumber prev_prefetch = 0; + + RelationOpenSmgr(vacrel->rel); + + while (++ptupindex < vacrel->dead_tuples->num_tuples && + prefetch_limit > 0) + { + ItemPointer ptr = &vacrel->dead_tuples->itemptrs[ptupindex]; + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) + { + prev_prefetch = ItemPointerGetBlockNumber(ptr); + prefetch_limit -= 1; + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + } + } + } + else + { + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); + while (ptupindex < vacrel->dead_tuples->num_tuples) + { + if (toPrefetch != ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex])) + break; + } + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); + } } + vacrel->blkno = tblk; buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, vacrel->bstrategy); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c44a27bafbf..f1e676bcc3e 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -38,7 +38,6 @@ static const f_smgr smgr_md = { .smgr_unlink = mdunlink, .smgr_extend = mdextend, .smgr_prefetch = mdprefetch, - .smgr_reset_prefetch = md_reset_prefetch, .smgr_read = mdread, .smgr_write = mdwrite, .smgr_writeback = mdwriteback, @@ -501,15 +500,6 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } -/* - * smgr_reset_prefetch() -- Cancel all previos prefetch requests - */ -void -smgr_reset_prefetch(SMgrRelation reln) -{ - (*reln->smgr).smgr_reset_prefetch(reln); -} - /* * smgrread() -- read a particular block from a relation into the supplied * buffer. diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 3305897bcd5..19e469e1cef 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -157,7 +157,6 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); -extern void smgr_reset_prefetch(SMgrRelation reln); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, From bbcefaf7e575ad4d1d28b250b7cf7cb295abba7e Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 11 Nov 2022 14:24:25 +0100 Subject: [PATCH 179/214] Fix prefetch issues in parallel scans and vacuum's cleanup scan (#234) Parallel seqscans didn't take their parallelism into account when determining which block to prefetch, and vacuum's cleanup scan didn't correctly determine which blocks would need to be prefetched, and could get into an infinite loop. 
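For illustration, a minimal standalone sketch of the wrap-around arithmetic involved
(hypothetical names, not the PostgreSQL code itself): a synchronized scan starts at
scan_start and covers nblocks blocks, and we want the physical block that lies
'distance' blocks ahead of the current page in scan order, clamped to the end of the
scan window:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Which physical block should be prefetched when a (possibly synchronized)
     * scan that started at scan_start over nblocks blocks is currently reading
     * 'page' and wants to stay 'distance' blocks ahead?  Illustrative only. */
    static uint32_t
    prefetch_target(uint32_t scan_start, uint32_t nblocks,
                    uint32_t page, uint32_t distance)
    {
        /* logical position of 'page' within the scan, handling wrap-around */
        uint64_t logical = (page >= scan_start)
            ? (uint64_t) page - scan_start
            : (uint64_t) page + nblocks - scan_start;
        /* never look past the last block of the scan window */
        uint64_t target = logical + distance;

        if (target > nblocks - 1)
            target = nblocks - 1;
        /* map the logical offset back to a physical block number */
        return (uint32_t) ((scan_start + target) % nblocks);
    }

    int
    main(void)
    {
        /* scan of 10 blocks starting at block 7; 3 ahead of block 9 is block 2 */
        assert(prefetch_target(7, 10, 9, 3) == 2);
        /* near the end of the scan the target is clamped to the last block (6) */
        assert(prefetch_target(7, 10, 5, 3) == 6);
        printf("ok\n");
        return 0;
    }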
--- src/backend/access/heap/heapam.c | 77 +++++++++++++++++++++++----- src/backend/access/heap/vacuumlazy.c | 57 +++++++++++++++----- 2 files changed, 108 insertions(+), 26 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 83c554da4f5..5bbeb5afbe1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -400,32 +400,81 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) CHECK_FOR_INTERRUPTS(); /* Prefetch next block */ - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0 && scan->rs_nblocks > 0) { - uint32 prefetch_limit = seqscan_prefetch_buffers; - BlockNumber prefetch_start = page; + int64 nblocks; + int64 rel_scan_start; + int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + + int64 prefetch_start; /* start block of prefetch requests this iteration */ + int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; - if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) - prefetch_limit = pbscanwork->phsw_chunk_remaining; + Assert(seqscan_prefetch_buffers > 0); + + /* + * Parallel scans look like repeated sequential table scans for + * prefetching; with a scan start at nalloc + ch_remaining - ch_size + */ + if (pbscanwork != NULL) + { + rel_scan_start = (BlockNumber) pbscanwork->phsw_nallocated + 1 + + pbscanwork->phsw_chunk_remaining + - pbscanwork->phsw_chunk_size; + rel_scan_end = Min(pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining, + scan->rs_nblocks); + nblocks = pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining; + } + else + { + rel_scan_start = scan->rs_startblock; + rel_scan_end = scan->rs_startblock + scan->rs_nblocks; + nblocks = scan->rs_nblocks; + } + + Assert(rel_scan_start <= page && page <= rel_scan_end); /* - * If this is the first page, initiate prefetch of pages page..page + n. - * On each subsequent call, prefetch the next page that we haven't - * prefetched yet, at page + n. + * If this is the first page of this seqscan, initiate prefetch of + * pages page..page + n. On each subsequent call, prefetch the next + * page that we haven't prefetched yet, at page + n. 
+ * If this is the last page of the prefetch, */ - if (scan->rs_startblock != page) + if (rel_scan_start != page) { - prefetch_start = (page + prefetch_limit - 1) % scan->rs_nblocks; - prefetch_limit = 1; + prefetch_start = (page + seqscan_prefetch_buffers - 1); + + prefetch_end = prefetch_start + 1; + + /* If we've wrapped around, add nblocks to get the block number in the [start, end] range */ + if (page < rel_scan_start) + prefetch_start += nblocks; } else + { + /* first block we're fetching, cannot have wrapped around yet */ prefetch_start = page; + prefetch_end = rel_scan_end; + } + + /* do not prefetch if the only page we're trying to prefetch is past the end of our scan window */ + if (prefetch_start > rel_scan_end) + prefetch_end = 0; + + if (prefetch_end > prefetch_start + seqscan_prefetch_buffers) + prefetch_end = prefetch_start + seqscan_prefetch_buffers; + RelationOpenSmgr(scan->rs_base.rs_rd); - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, - (prefetch_start+i) % scan->rs_nblocks); + + while (prefetch_start < prefetch_end) + { + BlockNumber blckno = (prefetch_start % nblocks); + Assert(blckno < nblocks); + Assert(blckno < INT_MAX); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); + prefetch_start += 1; + } } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index df4863672ff..aef76991b1c 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -913,6 +913,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) BlockNumber nblocks, blkno, next_unskippable_block, + next_prefetch_block, next_failsafe_block, next_fsm_block_to_vacuum; PGRUsage ru0; @@ -942,6 +943,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) nblocks = RelationGetNumberOfBlocks(vacrel->rel); next_unskippable_block = 0; + next_prefetch_block = 0; next_failsafe_block = 0; next_fsm_block_to_vacuum = 0; vacrel->rel_pages = nblocks; @@ -1218,6 +1220,33 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) + { + /* + * Prefetch seqscan_prefetch_buffers blocks ahead + */ + uint32 prefetch_budget = seqscan_prefetch_buffers; + + /* never trail behind the current scan */ + if (next_prefetch_block < blkno) + next_prefetch_block = blkno; + + /* but only up to the end of the relation */ + if (prefetch_budget > vacrel->rel_pages - next_prefetch_block) + prefetch_budget = vacrel->rel_pages - next_prefetch_block; + + /* And only up to seqscan_prefetch_buffers ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + seqscan_prefetch_buffers) + prefetch_budget = blkno + seqscan_prefetch_buffers - next_prefetch_block; + + /* And only up to the next unskippable block */ + if (next_prefetch_block + prefetch_budget > next_unskippable_block) + prefetch_budget = next_unskippable_block - next_prefetch_block; + + for (; prefetch_budget-- > 0; next_prefetch_block++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, next_prefetch_block); + } + buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); @@ -2357,36 +2386,40 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * If we're just starting out, prefetch N consecutive blocks. 
* If not, only the next 1 block */ - if (tupindex == 0) + if (ptupindex == 0) { - int prefetch_limit = Min(vacrel->dead_tuples->num_tuples - tupindex - 1, - Min(vacrel->rel_pages, - seqscan_prefetch_buffers)); - BlockNumber prev_prefetch = 0; + int prefetch_budget = Min(vacrel->dead_tuples->num_tuples, + Min(vacrel->rel_pages, + seqscan_prefetch_buffers)); + BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); RelationOpenSmgr(vacrel->rel); + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); while (++ptupindex < vacrel->dead_tuples->num_tuples && - prefetch_limit > 0) + prefetch_budget > 0) { ItemPointer ptr = &vacrel->dead_tuples->itemptrs[ptupindex]; if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) { prev_prefetch = ItemPointerGetBlockNumber(ptr); - prefetch_limit -= 1; + prefetch_budget -= 1; PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); } } } - else + else if (ptupindex < vacrel->dead_tuples->num_tuples) { - BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); - while (ptupindex < vacrel->dead_tuples->num_tuples) + BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); + while (++ptupindex < vacrel->dead_tuples->num_tuples) { - if (toPrefetch != ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex])) + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); + if (previous != toPrefetch) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); break; + } } - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); } } From 428d1adfe5eb22b61d8a3d60bc5dad1093963441 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Nov 2022 15:31:31 +0200 Subject: [PATCH 180/214] Use prefetch in pg_prewarm extension (#236) * Use prefetch in pg_prewarm extension * Change prefetch order as suggested in review --- contrib/pg_prewarm/pg_prewarm.c | 10 ++++++++-- contrib/pg_prewarm/pg_prewarm.control | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 00438239749..3a939b672b0 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -18,6 +18,7 @@ #include "access/relation.h" #include "fmgr.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "storage/bufmgr.h" #include "storage/smgr.h" #include "utils/acl.h" @@ -183,14 +184,19 @@ pg_prewarm(PG_FUNCTION_ARGS) } else if (ptype == PREWARM_BUFFER) { + BlockNumber prefetch_block = first_block; /* * In buffer mode, we actually pull the data into shared_buffers. 
*/ for (block = first_block; block <= last_block; ++block) { - Buffer buf; - + Buffer buf; + int prefetch_stop = block + Min(last_block - block + 1, seqscan_prefetch_buffers); CHECK_FOR_INTERRUPTS(); + while (prefetch_block < prefetch_stop) + { + PrefetchBuffer(rel, forkNumber, prefetch_block++); + } buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index 40e3add4810..d40d1a000b7 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -3,3 +3,4 @@ comment = 'prewarm relation data' default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true +trusted = true From 02b845e9860a9209e123e51fc79460bd12ffa205 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 23 Nov 2022 12:54:52 +0100 Subject: [PATCH 181/214] PG14: Prefetch cleanup (#242) * Update prefetch mechanisms: - **Enable enable_seqscan_prefetch by default** - Store prefetch distance in the relevant scan structs - Slow start sequential scan, to accommodate LIMIT clauses. - Replace seqscan_prefetch_buffer with the relations' tablespaces' *_io_concurrency; and drop seqscan_prefetch_buffer as a result. - Clarify enable_seqscan_prefetch GUC description - Fix prefetch in pg_prewarm - Add prefetching to autoprewarm worker - Fix an issue where we'd incorrectly not prefetch data when hitting a table wraparound. The same issue also resulted in assertion failures in debug builds. - Fix parallel scan prefetching - we didn't take into account that parallel scans have scan synchronization, too. --- contrib/pg_prewarm/autoprewarm.c | 34 +++++++++++ contrib/pg_prewarm/pg_prewarm.c | 10 +++- src/backend/access/heap/heapam.c | 82 +++++++++++++++++++------- src/backend/access/heap/vacuumlazy.c | 41 +++++++------ src/backend/optimizer/path/costsize.c | 1 - src/backend/utils/misc/guc.c | 18 +----- src/include/access/heapam.h | 4 ++ src/include/optimizer/cost.h | 1 - src/test/regress/expected/sysviews.out | 2 +- 9 files changed, 133 insertions(+), 60 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 0289ea657cb..1fa54a5b00a 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -53,6 +53,7 @@ #include "utils/rel.h" #include "utils/relfilenodemap.h" #include "utils/resowner.h" +#include "utils/spccache.h" #define AUTOPREWARM_FILE "autoprewarm.blocks" @@ -433,10 +434,12 @@ void autoprewarm_database_main(Datum main_arg) { int pos; + int io_concurrency; BlockInfoRecord *block_info; Relation rel = NULL; BlockNumber nblocks = 0; BlockInfoRecord *old_blk = NULL; + BlockInfoRecord *prefetch_blk = NULL; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. 
*/ @@ -502,6 +505,8 @@ autoprewarm_database_main(Datum main_arg) if (!rel) CommitTransactionCommand(); + else + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); } if (!rel) { @@ -534,6 +539,35 @@ autoprewarm_database_main(Datum main_arg) continue; } + /* if prefetching is enabled for this relation */ + if (io_concurrency > 0) + { + /* make prefetch_blk catch up */ + if (blk > prefetch_blk) + { + prefetch_blk = blk; + } + + /* now, prefetch all following blocks */ + while (prefetch_blk <= &block_info[apw_state->prewarm_stop_idx]) + { + /* unless they're of a different relfilenode */ + if (prefetch_blk->filenode != blk->filenode || + prefetch_blk->forknum != blk->forknum || + prefetch_blk->blocknum >= nblocks) + break; + + /* or unless they are more than io_concurrency blocks ahead */ + if (blk + io_concurrency <= prefetch_blk) + break; + + PrefetchBuffer(rel, prefetch_blk->forknum, prefetch_blk->blocknum); + + /* continue with the next block */ + prefetch_blk++; + } + } + /* Prewarm buffer. */ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, NULL); diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 3a939b672b0..1e7a99affad 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -25,6 +25,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" PG_MODULE_MAGIC; @@ -185,13 +186,20 @@ pg_prewarm(PG_FUNCTION_ARGS) else if (ptype == PREWARM_BUFFER) { BlockNumber prefetch_block = first_block; + Oid nspOid; + int io_concurrency; + + nspOid = rel->rd_rel->reltablespace; + io_concurrency = get_tablespace_maintenance_io_concurrency(nspOid); + /* * In buffer mode, we actually pull the data into shared_buffers. */ for (block = first_block; block <= last_block; ++block) { Buffer buf; - int prefetch_stop = block + Min(last_block - block + 1, seqscan_prefetch_buffers); + BlockNumber prefetch_stop = block + Min(last_block - block + 1, + io_concurrency); CHECK_FOR_INTERRUPTS(); while (prefetch_block < prefetch_stop) { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5bbeb5afbe1..18283aa38aa 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -317,6 +317,27 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_startblock = 0; } + if (enable_seqscan_prefetch) + { + /* + * Do not use tablespace setting for catalog scans, as we might have + * the tablespace settings in the catalogs locked already, which + * might result in a deadlock. 
+ */ + if (IsCatalogRelation(scan->rs_base.rs_rd)) + scan->rs_prefetch_maximum = effective_io_concurrency; + else + scan->rs_prefetch_maximum = + get_tablespace_io_concurrency(scan->rs_base.rs_rd->rd_rel->reltablespace); + + scan->rs_prefetch_target = 1; + } + else + { + scan->rs_prefetch_maximum = -1; + scan->rs_prefetch_target = -1; + } + scan->rs_numblocks = InvalidBlockNumber; scan->rs_inited = false; scan->rs_ctup.t_data = NULL; @@ -400,17 +421,17 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) CHECK_FOR_INTERRUPTS(); /* Prefetch next block */ - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0 && scan->rs_nblocks > 0) + if (scan->rs_prefetch_maximum > 0 && scan->rs_nblocks > 1) { int64 nblocks; int64 rel_scan_start; int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + int64 scan_pageoff; /* page, but adjusted for scan position */ int64 prefetch_start; /* start block of prefetch requests this iteration */ int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; - - Assert(seqscan_prefetch_buffers > 0); + ParallelBlockTableScanDesc pbscandesc = (ParallelBlockTableScanDesc) sscan->rs_parallel; /* * Parallel scans look like repeated sequential table scans for @@ -418,12 +439,20 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ if (pbscanwork != NULL) { - rel_scan_start = (BlockNumber) pbscanwork->phsw_nallocated + 1 - + pbscanwork->phsw_chunk_remaining + uint64 start_offset, + end_offset; + + Assert(pbscandesc != NULL); + start_offset = pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1 - pbscanwork->phsw_chunk_size; - rel_scan_end = Min(pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining, - scan->rs_nblocks); - nblocks = pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining; + end_offset = Min(pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1, + pbscandesc->phs_nblocks); + + rel_scan_start = (int64) (pbscandesc->phs_startblock) + start_offset; + rel_scan_end = (int64) (pbscandesc->phs_startblock) + end_offset; + nblocks = pbscandesc->phs_nblocks; } else { @@ -432,7 +461,14 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) nblocks = scan->rs_nblocks; } - Assert(rel_scan_start <= page && page <= rel_scan_end); + prefetch_end = rel_scan_end; + + if ((uint64) page < rel_scan_start) + scan_pageoff = page + nblocks; + else + scan_pageoff = page; + + Assert(rel_scan_start <= scan_pageoff && scan_pageoff <= rel_scan_end); /* * If this is the first page of this seqscan, initiate prefetch of @@ -442,19 +478,12 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ if (rel_scan_start != page) { - prefetch_start = (page + seqscan_prefetch_buffers - 1); - + prefetch_start = scan_pageoff + (int64) scan->rs_prefetch_target - 1; prefetch_end = prefetch_start + 1; - - /* If we've wrapped around, add nblocks to get the block number in the [start, end] range */ - if (page < rel_scan_start) - prefetch_start += nblocks; } else { - /* first block we're fetching, cannot have wrapped around yet */ - prefetch_start = page; - + prefetch_start = scan_pageoff; prefetch_end = rel_scan_end; } @@ -462,8 +491,11 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) if (prefetch_start > rel_scan_end) prefetch_end = 0; - if (prefetch_end > prefetch_start + seqscan_prefetch_buffers) - prefetch_end = prefetch_start + seqscan_prefetch_buffers; + if (prefetch_end > prefetch_start + scan->rs_prefetch_target) + 
prefetch_end = prefetch_start + scan->rs_prefetch_target; + + if (prefetch_end > rel_scan_end) + prefetch_end = rel_scan_end; RelationOpenSmgr(scan->rs_base.rs_rd); @@ -475,6 +507,16 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); prefetch_start += 1; } + + /* + * Use exponential growth of readahead up to prefetch_maximum, to + * make sure that a low LIMIT does not result in high IO overhead, + * but operations in general are still very fast. + */ + if (scan->rs_prefetch_target < scan->rs_prefetch_maximum / 2) + scan->rs_prefetch_target *= 2; + else if (scan->rs_prefetch_target < scan->rs_prefetch_maximum) + scan->rs_prefetch_target = scan->rs_prefetch_maximum; } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index aef76991b1c..a87cb08e063 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -80,6 +80,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_rusage.h" +#include "utils/spccache.h" #include "utils/timestamp.h" @@ -310,6 +311,9 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* prefetch */ + int io_concurrency; + /* Wraparound failsafe has been triggered? */ bool failsafe_active; /* Consider index vacuuming bypass optimization? */ @@ -556,6 +560,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set up high level stuff about rel */ vacrel->rel = rel; + vacrel->io_concurrency = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, &vacrel->indrels); vacrel->failsafe_active = false; @@ -1220,12 +1226,12 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) + if (vacrel->io_concurrency > 0) { /* - * Prefetch seqscan_prefetch_buffers blocks ahead + * Prefetch io_concurrency blocks ahead */ - uint32 prefetch_budget = seqscan_prefetch_buffers; + uint32 prefetch_budget = vacrel->io_concurrency; /* never trail behind the current scan */ if (next_prefetch_block < blkno) @@ -1235,9 +1241,9 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) if (prefetch_budget > vacrel->rel_pages - next_prefetch_block) prefetch_budget = vacrel->rel_pages - next_prefetch_block; - /* And only up to seqscan_prefetch_buffers ahead of the current vacuum scan */ - if (next_prefetch_block + prefetch_budget > blkno + seqscan_prefetch_buffers) - prefetch_budget = blkno + seqscan_prefetch_buffers - next_prefetch_block; + /* And only up to io_concurrency ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + vacrel->io_concurrency) + prefetch_budget = blkno + vacrel->io_concurrency - next_prefetch_block; /* And only up to the next unskippable block */ if (next_prefetch_block + prefetch_budget > next_unskippable_block) @@ -2380,42 +2386,35 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]); - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) + if (vacrel->io_concurrency > 0) { /* * If we're just starting out, prefetch N consecutive blocks. 
* If not, only the next 1 block */ - if (ptupindex == 0) - { + if (ptupindex == 0) { int prefetch_budget = Min(vacrel->dead_tuples->num_tuples, Min(vacrel->rel_pages, - seqscan_prefetch_buffers)); + vacrel->io_concurrency)); BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); RelationOpenSmgr(vacrel->rel); PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); while (++ptupindex < vacrel->dead_tuples->num_tuples && - prefetch_budget > 0) - { + prefetch_budget > 0) { ItemPointer ptr = &vacrel->dead_tuples->itemptrs[ptupindex]; - if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) - { + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) { prev_prefetch = ItemPointerGetBlockNumber(ptr); prefetch_budget -= 1; PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); } } - } - else if (ptupindex < vacrel->dead_tuples->num_tuples) - { + } else if (ptupindex < vacrel->dead_tuples->num_tuples) { BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); - while (++ptupindex < vacrel->dead_tuples->num_tuples) - { + while (++ptupindex < vacrel->dead_tuples->num_tuples) { BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[ptupindex]); - if (previous != toPrefetch) - { + if (previous != toPrefetch) { PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); break; } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c0c2d4a9aa8..ba539cc15b5 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -151,7 +151,6 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; bool enable_seqscan_prefetch = true; -int seqscan_prefetch_buffers = 0; typedef struct { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a463d8e2846..994a6da3695 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -956,14 +956,13 @@ static const unit_conversion time_unit_conversion_table[] = static struct config_bool ConfigureNamesBool[] = { { - {"enable_seqscan_prefetch", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the sequence scan next page prefetching."), + {"enable_seqscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of next pages in sequential scans."), NULL, GUC_EXPLAIN }, &enable_seqscan_prefetch, - false, /* temporary disable to be able to merge in main */ - /* true, */ + true, NULL, NULL, NULL }, { @@ -2154,17 +2153,6 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { - { - {"seqscan_prefetch_buffers", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Number of subsequent buffer to be prefetched during sequential scan."), - NULL, - GUC_EXPLAIN - }, - &seqscan_prefetch_buffers, - /* 8, 0, 1000, */ - 0, 0, 1000, /* temporary disable to be able to merge in main */ - NULL, NULL, NULL - }, { {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING, gettext_noop("Forces a switch to the next WAL file if a " diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 4f1dff9ca1b..fd67d500dc6 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -71,6 +71,10 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* prefetch info. rs_prefetch_maximum = -1 when disabled. 
*/ + int rs_prefetch_maximum; /* io_concurrency of tablespace */ + int rs_prefetch_target; /* current readahead target */ + /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 155d24cc2ac..8639b59687c 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -68,7 +68,6 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; extern PGDLLIMPORT bool enable_seqscan_prefetch; -extern PGDLLIMPORT int seqscan_prefetch_buffers; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 89aebb5e10b..4453fcaa517 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -120,7 +120,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on - enable_seqscan_prefetch | off + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on (21 rows) From b19607021d9e64902066ebe94b78da8878a7d1fc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 24 Nov 2022 11:38:10 +0200 Subject: [PATCH 182/214] Do not produce open file error for unlogged relations (#239) --- src/backend/storage/smgr/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index c7f2c647851..d843b9c207c 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -778,7 +778,13 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno; - mdopenfork(reln, forknum, EXTENSION_FAIL); + /* NEON: md smgr is used in Neon for unlogged and temp relations. + * After compute node restart their data is deleted but unlogged tables are still present in system catalog. + * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. + * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL + */ + if (!mdopenfork(reln, forknum, RelFileNodeBackendIsTemp(reln->smgr_rnode) ? 
EXTENSION_FAIL : EXTENSION_RETURN_NULL)) + return 0; /* mdopen has opened the first segment */ Assert(reln->md_num_open_segs[forknum] > 0); From 13f436425fd53e11fd2d67032602038dec180db4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 24 Nov 2022 11:46:25 +0200 Subject: [PATCH 183/214] =?UTF-8?q?Maintain=20last=20written=20LSN=20for?= =?UTF-8?q?=20each=20page=20to=20enable=20prefetch=20on=20vacuum,=E2=80=A6?= =?UTF-8?q?=20(#244)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Maintain last written LSN for each page to enable prefetch on vacuum, delete and other massive update operations * Move PageSetLSN in heap_xlog_visible before MarkBufferDirty --- src/backend/access/heap/heapam.c | 12 ++++++++++-- src/backend/access/transam/xlog.c | 26 +++++++++----------------- src/backend/utils/misc/guc.c | 4 ++-- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 18283aa38aa..3bb107ab7a4 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8863,8 +8863,16 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); - if (XLogHintBitIsNeeded()) - PageSetLSN(page, lsn); + /* + * NEON: despite to the comment above we need to update page LSN here. + * See discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462 + * For Neon this assignment is critical because otherwise last written LSN tracked at compute doesn't + * match with page LSN assignee by WAL-redo and as a result, prefetched page is rejected. + * + * It is fixed in upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea + * but until it is merged we still need to carry a patch here. + */ + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 144bf0e3432..74f02abe923 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -195,7 +195,7 @@ typedef struct LastWrittenLsnCacheEntry /* - * Cache of last written LSN for each relation chunk (hash bucket). + * Cache of last written LSN for each relation page. * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last * relation metadata update. * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), @@ -804,8 +804,6 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; -#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ - /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! @@ -8950,7 +8948,7 @@ GetInsertRecPtr(void) * It returns an upper bound for the last written LSN of a given page, * either from a cached last written LSN or a global maximum last written LSN. * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. - * If cache is large enough ,iterting through all hash items may be rather expensive. + * If cache is large enough, iterating through all hash items may be rather expensive. * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. 
*/ XLogRecPtr @@ -8969,7 +8967,7 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) BufferTag key; key.rnode = rnode; key.forkNum = forknum; - key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; + key.blockNum = blkno; entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); if (entry != NULL) lsn = entry->lsn; @@ -8993,9 +8991,9 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) /* * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. * We maintain cache of last written LSNs with limited size and LRU replacement - * policy. To reduce cache size we store max LSN not for each page, but for - * bucket (1024 blocks). This cache allows to use old LSN when - * requesting pages of unchanged or appended relations. + * policy. Keeping last written LSN for each page allows to use old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case massive update operations (like vacuum or remove). * * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated. * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. @@ -9017,19 +9015,13 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for LastWrittenLsnCacheEntry* entry; BufferTag key; bool found; - BlockNumber bucket; - BlockNumber start_bucket; /* inclusive */ - BlockNumber end_bucket; /* exclusive */ - - start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - end_bucket = from == REL_METADATA_PSEUDO_BLOCKNO - ? start_bucket + 1 : (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; + BlockNumber i; key.rnode = rnode; key.forkNum = forknum; - for (bucket = start_bucket; bucket < end_bucket; bucket++) + for (i = 0; i < n_blocks; i++) { - key.blockNum = bucket; + key.blockNum = from + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 994a6da3695..c6a054d6c73 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2367,11 +2367,11 @@ static struct config_int ConfigureNamesInt[] = { {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, - gettext_noop("Size of las written LSN cache used by Neon."), + gettext_noop("Size of last written LSN cache used by Neon."), NULL }, &lastWrittenLsnCacheSize, - 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ + 128*1024, 1024, INT_MAX, NULL, NULL, NULL }, From 01c1dfb7612a3e278c1b5d5b48a8e8e0365f9953 Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 5 Dec 2022 16:19:10 +0100 Subject: [PATCH 184/214] Prefetch cleanup: (#247) - Prefetch the pages in index vacuum's sequential scans Implemented in NBTREE, GIST and SP-GIST. BRIN does not have a 2nd phase of vacuum, and both GIN and HASH clean up their indexes in a non-seqscan fashion: GIN scans the btree from left to right, and HASH only scans the initial buckets sequentially. 
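The read-ahead pattern added to the index vacuum scans boils down to keeping a
prefetch cursor a bounded distance ahead of the scan cursor. A standalone sketch of
that pattern, with hypothetical names and a printf standing in for PrefetchBuffer:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for PrefetchBuffer(); illustrative only. */
    static void
    prefetch(uint32_t blkno)
    {
        printf("prefetch block %u\n", blkno);
    }

    /* Keep a prefetch cursor at most io_concurrency blocks ahead of the block
     * the scan is about to read, without running past the relation end. */
    static void
    vacuum_scan(uint32_t num_pages, uint32_t io_concurrency)
    {
        uint32_t prefetch_blkno = 0;

        for (uint32_t blkno = 0; blkno < num_pages; blkno++)
        {
            /* the prefetch cursor never trails the block being read */
            if (prefetch_blkno < blkno)
                prefetch_blkno = blkno;

            /* issue requests until we are io_concurrency blocks ahead */
            while (prefetch_blkno < num_pages &&
                   prefetch_blkno < blkno + io_concurrency)
                prefetch(prefetch_blkno++);

            printf("read block %u\n", blkno);  /* the actual page read goes here */
        }
    }

    int
    main(void)
    {
        vacuum_scan(8, 3);
        return 0;
    }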
--- src/backend/access/gist/gistvacuum.c | 18 ++++++++++++++++++ src/backend/access/hash/hash.c | 16 ++++++++++++++++ src/backend/access/nbtree/nbtree.c | 18 ++++++++++++++++++ src/backend/access/spgist/spgvacuum.c | 17 ++++++++++++++++- 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 0663193531a..5e4350fd33e 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -23,6 +23,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* Working state needed by gistbulkdelete */ typedef struct @@ -130,8 +131,12 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; bool needLock; BlockNumber blkno; + BlockNumber prefetch_blkno; + int io_concurrency; MemoryContext oldctx; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * Reset fields that track information about the entire index now. This * avoids double-counting in the case where a single VACUUM command @@ -203,6 +208,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); blkno = GIST_ROOT_BLKNO; + prefetch_blkno = blkno; for (;;) { /* Get the current relation length */ @@ -215,9 +221,21 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) + { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + gistvacuumpage(&vstate, blkno, blkno); + } } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 0752fb38a92..b672aa5021c 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -31,6 +31,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -465,13 +466,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -486,9 +491,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -499,6 +509,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if 
(io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 64ebf58fed2..d6fef1490d5 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -36,6 +36,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -908,6 +909,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTVacState vstate; BlockNumber num_pages; BlockNumber scanblkno; + BlockNumber prefetch_blkno; + int io_concurrency; bool needLock; /* @@ -947,6 +950,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.maxbufsize = 0; vstate.pendingpages = NULL; vstate.npendingpages = 0; + + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Consider applying _bt_pendingfsm_finalize optimization */ _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); @@ -976,6 +982,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); scanblkno = BTREE_METAPAGE + 1; + prefetch_blkno = scanblkno; + for (;;) { /* Get the current relation length */ @@ -992,9 +1000,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (scanblkno >= num_pages) break; + + if (prefetch_blkno < scanblkno) + prefetch_blkno = scanblkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < scanblkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; scanblkno < num_pages; scanblkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + btvacuumpage(&vstate, scanblkno); if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 76fb0374c42..de39bc22cb6 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -27,6 +27,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/snapmgr.h" +#include "utils/spccache.h" /* Entry in pending-list of TIDs we need to revisit */ @@ -796,7 +797,11 @@ spgvacuumscan(spgBulkDeleteState *bds) Relation index = bds->info->index; bool needLock; BlockNumber num_pages, - blkno; + blkno, + prefetch_blkno; + int io_concurrency; + + io_concurrency = get_tablespace_maintenance_io_concurrency(index->rd_rel->reltablespace); /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); @@ -836,9 +841,19 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno++); + spgvacuumpage(bds, blkno); /* empty the pending-list after 
each page */ if (bds->pendingList != NULL) From ddc9d478ef70d39188b3ca8d42225be873a09426 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 7 Dec 2022 14:32:02 +0100 Subject: [PATCH 185/214] Fix uninitialized variable in spgvacuum.c (#250) The compiler warning was correct and would have the potential to disable prefetching. --- src/backend/access/spgist/spgvacuum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index de39bc22cb6..8db596b1cb8 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -829,6 +829,8 @@ spgvacuumscan(spgBulkDeleteState *bds) * in btvacuumscan(). */ blkno = SPGIST_METAPAGE_BLKNO + 1; + prefetch_blkno = blkno; + for (;;) { /* Get the current relation length */ From d78ad7684a343338660160e21e448f72afb94937 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Dec 2022 17:44:13 +0200 Subject: [PATCH 186/214] Update heap pge LSN in case of VM changes even if wal_redo_hints=off (#251) refer #2807 --- src/backend/access/heap/visibilitymap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 2e11f3fc0e4..c5eafec0746 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -289,7 +289,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * If data checksums are enabled (or wal_log_hints=on), we * need to protect the heap page from being torn. */ + /* NEON: we have to update page LSN even if wal_log_hints=off if (XLogHintBitIsNeeded()) + */ { Page heapPage = BufferGetPage(heapBuf); From 1a15868a5ea9bee20a45a5f3e0baaefef57a89b1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 12 Jan 2023 15:12:08 +0200 Subject: [PATCH 187/214] Show prefetch statistic in EXPLAIN (#248) * Show prefetch statistic in EXPLAIN refer #2994 * Collect per-node prefetch statistics * Show number of prefetch duplicates in explain --- src/backend/commands/explain.c | 38 +++++++++++++++++++++++++++++-- src/backend/executor/instrument.c | 8 +++++++ src/include/commands/explain.h | 1 + src/include/executor/instrument.h | 9 ++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 70551522dac..7bc1da96226 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,7 +47,6 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; - /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 #define X_CLOSING 1 @@ -121,6 +120,7 @@ static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); @@ -186,6 +186,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es->buffers = defGetBoolean(opt); + else if (strcmp(opt->defname, "prefetch") == 0) + es->prefetch = defGetBoolean(opt); else if 
(strcmp(opt->defname, "wal") == 0) es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) @@ -534,7 +536,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, else if (es->analyze) instrument_option |= INSTRUMENT_ROWS; - if (es->buffers) + if (es->buffers || es->prefetch) instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; @@ -2055,6 +2057,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -3490,6 +3496,34 @@ explain_get_index_name(Oid indexId) return result; } +/* + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + prefetch_info->expired, es); + ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, + prefetch_info->duplicates, es); + } +} + /* * Show buffer usage details. */ diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 2b106d8473c..55cb22250be 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,10 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); } @@ -255,6 +259,10 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates; INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index e94d9e49cf6..960dbf33dd8 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* 
print modified settings */ + bool prefetch; /* print prefetch statistic */ ExplainFormat format; /* output format */ /* state for output formatting --- not reset for each new plan tree */ int indent; /* current indentation level */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 2f9905b7c8e..602c7347a6c 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -15,6 +15,14 @@ #include "portability/instr_time.h" +/* Prefeth statistics */ +typedef struct +{ + int64 hits; + int64 misses; + int64 expired; + int64 duplicates; +} PrefetchInfo; /* * BufferUsage and WalUsage counters keep being incremented infinitely, @@ -35,6 +43,7 @@ typedef struct BufferUsage int64 temp_blks_written; /* # of temp blocks written */ instr_time blk_read_time; /* time spent reading */ instr_time blk_write_time; /* time spent writing */ + PrefetchInfo prefetch; /* prefetch statistics */ } BufferUsage; /* From 1a3341eb6baa116f6852167a2ae9d29a3ccb8a2d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 14 Feb 2023 19:00:12 +0200 Subject: [PATCH 188/214] Implement efficient prefetch for parallel bitmap heap scan (#258) * Implement efficient prefetch for parallel bitmap heap scan * Change MAX_IO_CONCURRENCY to be power of 2 --- src/backend/executor/nodeBitmapHeapscan.c | 194 +++++++--------------- src/include/nodes/execnodes.h | 16 +- src/include/storage/bufmgr.h | 4 +- 3 files changed, 80 insertions(+), 134 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 2db1914affb..8cf0ede3797 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -150,6 +150,8 @@ BitmapHeapNext(BitmapHeapScanState *node) */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); #ifdef USE_PREFETCH + node->n_prefetch_requests = 0; + node->prefetch_request_pos = 0; if (node->prefetch_maximum > 0) { pstate->prefetch_iterator = @@ -173,13 +175,6 @@ BitmapHeapNext(BitmapHeapScanState *node) tbm_attach_shared_iterate(dsa, pstate->tbmiterator); node->tbmres = tbmres = NULL; -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - node->shared_prefetch_iterator = - tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); - } -#endif /* USE_PREFETCH */ } node->initialized = true; } @@ -198,15 +193,24 @@ BitmapHeapNext(BitmapHeapScanState *node) if (!pstate) node->tbmres = tbmres = tbm_iterate(tbmiterator); else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + { + if (node->n_prefetch_requests != 0) + { + node->tbmres = tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_request_pos]; + node->n_prefetch_requests -= 1; + node->prefetch_request_pos = (node->prefetch_request_pos + 1) % MAX_IO_CONCURRENCY; + if (node->prefetch_pages != 0) + node->prefetch_pages -= 1; + } + else + node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + } if (tbmres == NULL) { /* no more entries in the bitmap */ break; } - BitmapAdjustPrefetchIterator(node, tbmres); - /* * We can skip fetching the heap page if we don't need any fields * from the heap, and the bitmap entries don't need rechecking, @@ -361,54 +365,21 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, TBMIterateResult *tbmres) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + TBMIterator *prefetch_iterator = node->prefetch_iterator; + Assert(node->pstate == NULL); - if (pstate == NULL) + if (node->prefetch_pages > 0) { - TBMIterator 
*prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } - return; + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; } - - if (node->prefetch_maximum > 0) + else if (prefetch_iterator) { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); - - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ } @@ -425,35 +396,14 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. */ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -507,56 +457,46 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - - return; } - - if (pstate->prefetch_pages < pstate->prefetch_target) + else { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) + while (1) { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; - /* - * Recheck under the mutex. 
If some other process has already - * done enough prefetching then we need not to do anything. - */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); + if (node->prefetch_pages < node->prefetch_target) + { + Assert(node->n_prefetch_requests < MAX_IO_CONCURRENCY); + node->prefetch_pages++; + do_prefetch = true; + } - if (!do_prefetch) - return; + if (!do_prefetch) + return; - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } + tbmpre = tbm_shared_iterate(node->shared_tbmiterator); + if (tbmpre != NULL) + { + memcpy(&node->prefetch_requests[(node->prefetch_request_pos + node->n_prefetch_requests) % MAX_IO_CONCURRENCY], tbmpre, sizeof(TBMIteratePrefetchResult)); + node->n_prefetch_requests += 1; + } + else + { + /* No more pages to prefetch */ + break; + } - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ @@ -613,8 +553,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_iterate(node->prefetch_iterator); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); if (node->vmbuffer != InvalidBuffer) @@ -627,7 +565,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->prefetch_iterator = NULL; node->initialized = false; node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; node->vmbuffer = InvalidBuffer; node->pvmbuffer = InvalidBuffer; @@ -683,8 +620,6 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_free(node->tbm); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->vmbuffer != InvalidBuffer) ReleaseBuffer(node->vmbuffer); if (node->pvmbuffer != InvalidBuffer) @@ -739,7 +674,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index c8286cc4f38..44e0a153440 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -23,6 +23,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" @@ -1644,6 +1645,15 @@ typedef struct ParallelBitmapHeapState char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } 
ParallelBitmapHeapState; +typedef struct TBMIteratePrefetchResult +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; +} TBMIteratePrefetchResult; + /* ---------------- * BitmapHeapScanState information * @@ -1664,7 +1674,6 @@ typedef struct ParallelBitmapHeapState * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1688,7 +1697,10 @@ typedef struct BitmapHeapScanState Size pscan_len; bool initialized; TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; + /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ + TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; + int n_prefetch_requests; /* number of used elements in prefetch_requests ring buffer */ + int prefetch_request_pos; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 6d140786c74..2b223ead345 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -86,8 +86,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 1000 +/* upper limit for effective_io_concurrency (better to he power of 2) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ From 3ab16be781caac750d685b69d82be3e64cc2765a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Feb 2023 17:37:44 +0200 Subject: [PATCH 189/214] Unlogged index fix v14 (#259) * Avoid errors when accessing indexes of unlogge tables after compute restart * Address review complaints: add comment to mdopenfork * Initialize unlogged index undex eclusive lock --- src/backend/optimizer/util/plancat.c | 38 +++++++++++++++++++++++++++- src/backend/storage/smgr/md.c | 25 ++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index c1d5a2fef10..4adb3f0ba20 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -27,6 +27,7 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/heap.h" +#include "catalog/index.h" #include "catalog/pg_am.h" #include "catalog/pg_proc.h" #include "catalog/pg_statistic_ext.h" @@ -46,6 +47,8 @@ #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/partcache.h" @@ -80,6 +83,39 @@ static void set_baserel_partition_key_exprs(Relation relation, static void set_baserel_partition_constraint(Relation relation, RelOptInfo *rel); +static bool +is_index_valid(Relation index, LOCKMODE lmode) +{ + if (!index->rd_index->indisvalid) + return false; + + if (index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + { + while 
(true) + { + Buffer metapage = ReadBuffer(index, 0); + bool isNew = PageIsNew(BufferGetPage(metapage)); + ReleaseBuffer(metapage); + if (isNew) + { + Relation heap; + if (lmode != ExclusiveLock) + { + UnlockRelation(index, lmode); + LockRelation(index, ExclusiveLock); + lmode = ExclusiveLock; + continue; + } + DropRelFileNodesAllBuffers(&index->rd_smgr, 1); + heap = RelationIdGetRelation(index->rd_index->indrelid); + index->rd_indam->ambuild(heap, index, BuildIndexInfo(index)); + RelationClose(heap); + } + break; + } + } + return true; +} /* * get_relation_info - @@ -221,7 +257,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * still needs to insert into "invalid" indexes, if they're marked * indisready. */ - if (!index->indisvalid) + if (!is_index_valid(indexRelation, lmode)) { index_close(indexRelation, NoLock); continue; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d843b9c207c..058ed503ffe 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -493,6 +493,13 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + /* + * NEON: unlogged relation files are lost after compute restart - we need to implicitly recreate them + * to allow data insertion + */ + if (fd < 0 && (behavior & EXTENSION_CREATE)) + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) { if ((behavior & EXTENSION_RETURN_NULL) && @@ -655,9 +662,23 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); + /* NEON: md smgr is used in Neon for unlogged and temp relations. + * After compute node restart their data is deleted but unlogged tables are still present in system catalog. + * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. + * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL + */ v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - + RelFileNodeBackendIsTemp(reln->smgr_rnode) + ? 
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY + : EXTENSION_RETURN_NULL); + if (v == NULL) + { + char* path = relpath(reln->smgr_rnode, forknum); + (void)PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); + pfree(path); + MemSet(buffer, 0, BLCKSZ); + return; + } seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); From cf9e36cc4b7faaf1aa855e007feb411c72a5ab59 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 23 Feb 2023 15:21:01 +0200 Subject: [PATCH 190/214] Fix bitmap scan prefetch (#260) --- src/backend/executor/nodeBitmapHeapscan.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 8cf0ede3797..9dcc07122d2 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -211,6 +211,8 @@ BitmapHeapNext(BitmapHeapScanState *node) break; } + BitmapAdjustPrefetchIterator(node, tbmres); + /* * We can skip fetching the heap page if we don't need any fields * from the heap, and the bitmap entries don't need rechecking, @@ -366,7 +368,10 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, { #ifdef USE_PREFETCH TBMIterator *prefetch_iterator = node->prefetch_iterator; - Assert(node->pstate == NULL); + + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) + return; if (node->prefetch_pages > 0) { @@ -396,6 +401,10 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) + return; + if (node->prefetch_target >= node->prefetch_maximum) /* don't increase any further */ ; else if (node->prefetch_target >= node->prefetch_maximum / 2) From 338f1377f748a08ebb1335cd15680656a1228f7a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 24 Feb 2023 18:18:45 +0400 Subject: [PATCH 191/214] Revert handling of UNLOGGED tables on compute side. 
They will be handled in pageserver, ref https://github.com/neondatabase/neon/pull/3706 Reverts a9f503479d587fc3f3067e994acd5aae82151ec7 Reverts 7d7a5479084a5d09c271fc79e654c0d50a914e7f --- src/backend/optimizer/util/plancat.c | 38 +--------------------------- src/backend/storage/smgr/md.c | 33 +++--------------------- 2 files changed, 4 insertions(+), 67 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 4adb3f0ba20..c1d5a2fef10 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -27,7 +27,6 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/heap.h" -#include "catalog/index.h" #include "catalog/pg_am.h" #include "catalog/pg_proc.h" #include "catalog/pg_statistic_ext.h" @@ -47,8 +46,6 @@ #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" -#include "storage/buf_internals.h" -#include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/partcache.h" @@ -83,39 +80,6 @@ static void set_baserel_partition_key_exprs(Relation relation, static void set_baserel_partition_constraint(Relation relation, RelOptInfo *rel); -static bool -is_index_valid(Relation index, LOCKMODE lmode) -{ - if (!index->rd_index->indisvalid) - return false; - - if (index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) - { - while (true) - { - Buffer metapage = ReadBuffer(index, 0); - bool isNew = PageIsNew(BufferGetPage(metapage)); - ReleaseBuffer(metapage); - if (isNew) - { - Relation heap; - if (lmode != ExclusiveLock) - { - UnlockRelation(index, lmode); - LockRelation(index, ExclusiveLock); - lmode = ExclusiveLock; - continue; - } - DropRelFileNodesAllBuffers(&index->rd_smgr, 1); - heap = RelationIdGetRelation(index->rd_index->indrelid); - index->rd_indam->ambuild(heap, index, BuildIndexInfo(index)); - RelationClose(heap); - } - break; - } - } - return true; -} /* * get_relation_info - @@ -257,7 +221,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * still needs to insert into "invalid" indexes, if they're marked * indisready. */ - if (!is_index_valid(indexRelation, lmode)) + if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 058ed503ffe..c7f2c647851 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -493,13 +493,6 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); - /* - * NEON: unlogged relation files are lost after compute restart - we need to implicitly recreate them - * to allow data insertion - */ - if (fd < 0 && (behavior & EXTENSION_CREATE)) - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); - if (fd < 0) { if ((behavior & EXTENSION_RETURN_NULL) && @@ -662,23 +655,9 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); - /* NEON: md smgr is used in Neon for unlogged and temp relations. - * After compute node restart their data is deleted but unlogged tables are still present in system catalog. - * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. 
- * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL - */ v = _mdfd_getseg(reln, forknum, blocknum, false, - RelFileNodeBackendIsTemp(reln->smgr_rnode) - ? EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY - : EXTENSION_RETURN_NULL); - if (v == NULL) - { - char* path = relpath(reln->smgr_rnode, forknum); - (void)PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); - pfree(path); - MemSet(buffer, 0, BLCKSZ); - return; - } + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); @@ -799,13 +778,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno; - /* NEON: md smgr is used in Neon for unlogged and temp relations. - * After compute node restart their data is deleted but unlogged tables are still present in system catalog. - * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. - * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL - */ - if (!mdopenfork(reln, forknum, RelFileNodeBackendIsTemp(reln->smgr_rnode) ? EXTENSION_FAIL : EXTENSION_RETURN_NULL)) - return 0; + mdopenfork(reln, forknum, EXTENSION_FAIL); /* mdopen has opened the first segment */ Assert(reln->md_num_open_segs[forknum] > 0); From 7c8fa6ccf8d181e88dc62660d7d56a0ea9649398 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 23 Feb 2023 23:43:25 +0200 Subject: [PATCH 192/214] Allow external main functions to skip config load and make last written LSN cache optional. --- src/backend/access/transam/xlog.c | 6 +++++- src/backend/main/main.c | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 74f02abe923..dd093ec5b87 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5255,6 +5255,7 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); + if (lastWrittenLsnCacheSize > 0) { static HASHCTL info; info.keysize = sizeof(BufferTag); @@ -5264,6 +5265,7 @@ XLOGShmemInit(void) &info, HASH_ELEM | HASH_BLOBS); } + localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -8957,6 +8959,8 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) XLogRecPtr lsn; LastWrittenLsnCacheEntry* entry; + Assert(lastWrittenLsnCacheSize != 0); + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); /* Maximal last written LSN among all non-cached pages */ @@ -9001,7 +9005,7 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn == InvalidXLogRecPtr || n_blocks == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || lastWrittenLsnCacheSize == 0) return; LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 1d93111bd03..04f5d2a5c3a 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -55,7 +55,7 @@ static void check_root(const char *progname); typedef int (*MainFunc) (int argc, char *argv[]); static int -CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) +CallExtMain(char *library_name, char 
*main_func_name, int argc, char *argv[], bool load_config) { MainFunc main_func; @@ -72,7 +72,7 @@ CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) InitializeGUCOptions(); /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) + if (load_config && !SelectConfigFiles(NULL, progname)) exit(1); /* @@ -242,9 +242,9 @@ main(int argc, char *argv[]) NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) - CallExtMain("neon_walredo", "WalRedoMain", argc, argv); + CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false); else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) - CallExtMain("neon", "WalProposerSync", argc, argv); + CallExtMain("neon", "WalProposerSync", argc, argv, true); else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ From 841befc60e1da4521c5d090272865e1a01dfdef6 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 6 Mar 2023 18:56:19 +0200 Subject: [PATCH 193/214] Remove walredo-related hacks from InternalIpcMemoryCreate() Now similar kind of hack (using malloc() instead of shmem) is done in the wal-redo extension. --- src/backend/port/sysv_shmem.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 2dc4ec3d26e..09652171328 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -155,22 +155,6 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) } } #endif - - /* - * NEON: do not create shared memory segments for single user wal redo - * postgres. Many spawned instances of wal redo may exhaust kernel.shmmni - */ - if (am_wal_redo_postgres) - { - void *ptr = malloc(size); - - if (ptr == NULL) - { - ereport(FATAL, - (errmsg("could not create shared memory segment with size %zu for WAL redo process", size))); - } - return ptr; - } shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) From f156677d4ce43292919d749c481f2edfec3c0bb7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 27 Mar 2023 17:51:30 +0300 Subject: [PATCH 194/214] Adjust prefetch target for parallel bitmap scan (#273) * Adjust prefetch target for parallel bitmap scan * More fixes for parallel bitmap scan prefetch --- src/backend/executor/nodeBitmapHeapscan.c | 30 +++-------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 9dcc07122d2..828054895e0 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -154,15 +154,8 @@ BitmapHeapNext(BitmapHeapScanState *node) node->prefetch_request_pos = 0; if (node->prefetch_maximum > 0) { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; + node->prefetch_pages = 0; + node->prefetch_target = -1; } #endif @@ -264,19 +257,8 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. 
*/ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -401,10 +383,6 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ - if (node->pstate != NULL) - return; - if (node->prefetch_target >= node->prefetch_maximum) /* don't increase any further */ ; else if (node->prefetch_target >= node->prefetch_maximum / 2) From f2c0f39a3d57399ec95a298d1e5887ea2196b146 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 08:45:37 +0300 Subject: [PATCH 195/214] Heap bitmap scan prefetch fix2 v14 (#275) * Copy iterator result in BitmapHeapNext * Restore initial -1 value for prefetch_target * Add tbmres_copy to BitmapHeapScanState --- src/backend/executor/nodeBitmapHeapscan.c | 50 +++++++++++------------ src/include/nodes/execnodes.h | 4 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 828054895e0..b47d2caa71b 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,19 +149,15 @@ BitmapHeapNext(BitmapHeapScanState *node) * multiple processes to iterate jointly. */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - node->n_prefetch_requests = 0; - node->prefetch_request_pos = 0; - if (node->prefetch_maximum > 0) - { - node->prefetch_pages = 0; - node->prefetch_target = -1; - } -#endif /* We have initialized the shared state so wake up others. 
*/ BitmapDoneInitializingSharedState(pstate); } +#ifdef USE_PREFETCH + node->prefetch_head = 0; + node->prefetch_pages = 0; + node->prefetch_target = -1; +#endif /* Allocate a private iterator and attach the shared state to it */ node->shared_tbmiterator = shared_tbmiterator = @@ -184,20 +180,25 @@ BitmapHeapNext(BitmapHeapScanState *node) if (tbmres == NULL) { if (!pstate) - node->tbmres = tbmres = tbm_iterate(tbmiterator); + tbmres = tbm_iterate(tbmiterator); else { - if (node->n_prefetch_requests != 0) + if (node->prefetch_pages != 0) { - node->tbmres = tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_request_pos]; - node->n_prefetch_requests -= 1; - node->prefetch_request_pos = (node->prefetch_request_pos + 1) % MAX_IO_CONCURRENCY; - if (node->prefetch_pages != 0) - node->prefetch_pages -= 1; + tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_head]; + node->prefetch_pages -= 1; + node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY; } else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + tbmres = tbm_shared_iterate(shared_tbmiterator); + if (tbmres) + { + /* Need to copy result because iterator can be used for prefetch and vocant position in prefetch ring buffer can also be reused */ + memcpy(&node->tbmres_copy, tbmres, offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmres->ntuples, 0)); + tbmres = (TBMIterateResult *)&node->tbmres_copy; + } } + node->tbmres = tbmres; if (tbmres == NULL) { /* no more entries in the bitmap */ @@ -236,7 +237,6 @@ BitmapHeapNext(BitmapHeapScanState *node) /* AM doesn't think this block is valid, skip */ continue; } - if (tbmres->ntuples >= 0) node->exact_pages++; else @@ -455,8 +455,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) if (node->prefetch_pages < node->prefetch_target) { - Assert(node->n_prefetch_requests < MAX_IO_CONCURRENCY); - node->prefetch_pages++; + Assert(node->prefetch_pages < MAX_IO_CONCURRENCY); do_prefetch = true; } @@ -466,8 +465,10 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) tbmpre = tbm_shared_iterate(node->shared_tbmiterator); if (tbmpre != NULL) { - memcpy(&node->prefetch_requests[(node->prefetch_request_pos + node->n_prefetch_requests) % MAX_IO_CONCURRENCY], tbmpre, sizeof(TBMIteratePrefetchResult)); - node->n_prefetch_requests += 1; + memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY], + tbmpre, + offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmpre->ntuples, 0)); + node->prefetch_pages += 1; } else { @@ -477,7 +478,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) /* As above, skip prefetch if we expect not to need page */ skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && + !tbmpre->recheck && VM_ALL_VISIBLE(node->ss.ss_currentRelation, tbmpre->blockno, &node->pvmbuffer)); @@ -715,8 +716,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) * Maximum number of prefetches for the tablespace if configured, * otherwise the current value of the effective_io_concurrency GUC. 
*/ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + scanstate->prefetch_maximum = get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); scanstate->ss.ss_currentRelation = currentRelation; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 44e0a153440..0f4dbabfe38 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1699,8 +1699,8 @@ typedef struct BitmapHeapScanState TBMSharedIterator *shared_tbmiterator; /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; - int n_prefetch_requests; /* number of used elements in prefetch_requests ring buffer */ - int prefetch_request_pos; /* head position in ring buffer */ + TBMIteratePrefetchResult tbmres_copy; /* copy of current iterator result */ + int prefetch_head; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; From 7d6adc5cd8aa220b2fa50a71f802d1eed63da0a6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 08:49:02 +0300 Subject: [PATCH 196/214] Implement index prefetch for index and index-only scans (#277) * Implement index prefetch for index and index-only scans * Move prefetch_blocks array to the end of BTScanOpaqueData struct --- src/backend/access/nbtree/README | 44 +++++ src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/nbtree/nbtsearch.c | 214 ++++++++++++++++++++++++- src/backend/optimizer/path/costsize.c | 2 + src/backend/utils/misc/guc.c | 20 +++ src/include/access/nbtree.h | 17 ++ src/include/optimizer/cost.h | 3 + src/test/regress/expected/sysviews.out | 4 +- 9 files changed, 301 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index bfe33b6b431..b0cfa62e651 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1054,3 +1054,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. It is trivial for heap- and bitmap scans, +but requires more effort for index scans: to implement prefetch for +index scans, we need to find out subsequent leaf pages. + +Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for +forward and backward iteration. This list, however, can not trivially be used for prefetching because to locate the next page because we need first to load the current page. To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as it contains references to most of the target page's sibling pages. + +Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We will utilize the parent's linked list to improve the performance of this prefetch system past the key range of the parent page. + +We should prefetch not only leaf pages, but also the next parent page. 
+The trick is to correctly calculate the moment when it will be needed:
+We should not issue the prefetch request only once prefetch requests for all children of the current parent page have been issued, but already when only effective_io_concurrency line pointers are left to prefetch from the page.
+
+Currently there are two different prefetch implementations for
+index-only scan and index scan. An index-only scan doesn't need to access heap tuples, so it prefetches
+only B-Tree leaf pages (and their parents). Prefetch for index-only scan is performed only
+if a parallel plan is not used. A parallel index scan uses a critical section in which a parallel worker obtains the next
+page; the leaf page is loaded inside this critical section.
+If most of the time is spent loading the page, this effectively eliminates any concurrency
+and makes prefetch useless. For relatively small tables Postgres will not choose a parallel plan in
+any case, and for large tables it can be enforced by setting max_parallel_workers_per_gather=0.
+
+Prefetch for a normal (not index-only) index scan tries to prefetch the heap tuples
+referenced from the leaf page. The average number of items per page
+is about 100, which is comparable to the default value of effective_io_concurrency,
+so there is not much point in also prefetching the next leaf page.
+
+Because it is difficult to estimate the number of entries traversed by an index scan,
+we prefer not to prefetch a large number of pages from the very beginning.
+Such useless prefetch can reduce the performance of point lookups.
+Instead, we start with the smallest prefetch distance and increase it
+by INCREASE_PREFETCH_DISTANCE_STEP after processing each item
+until it reaches effective_io_concurrency. In case of an index-only
+scan we increase the prefetch distance after processing each leaf page,
+and for an index scan after processing each tuple.
+The only exception is the case when no key bounds are specified.
+In this case we traverse the whole relation and it makes sense
+to start with the largest possible prefetch distance from the very beginning.
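For the ordinary (heap-fetching) index scan case, the ramp-up described above boils down to the following forward-direction-only sketch of what _bt_next() does in this patch. It is not part of the patch itself: prefetch_heap_pages_from_current_pos is an illustrative name, and the sketch assumes the fields (prefetch_maximum, current_prefetch_distance, n_prefetch_requests) that this patch adds to BTScanOpaqueData in nbtree.h.

/*
 * Simplified, forward-scan-only sketch of the heap-page prefetch done in
 * _bt_next() by this patch.  The prefetch window grows by one step per
 * returned tuple until it reaches so->prefetch_maximum (the tablespace's
 * effective_io_concurrency).
 */
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "storage/bufmgr.h"

#define INCREASE_PREFETCH_DISTANCE_STEP 1	/* as defined in the patched nbtsearch.c */

static void
prefetch_heap_pages_from_current_pos(IndexScanDesc scan, BTScanOpaque so)
{
	int			prefetch_limit;
	int			distance = so->n_prefetch_requests;

	/* widen the window one step at a time, capped at prefetch_maximum */
	so->current_prefetch_distance =
		Min(so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP,
			so->prefetch_maximum);

	/* never ask for more pages than the current leaf page can supply */
	prefetch_limit = Min(so->current_prefetch_distance,
						 so->currPos.lastItem - so->currPos.firstItem + 1);

	/* the item being returned consumes one outstanding request */
	if (distance > 0)
		distance--;

	/* top the window back up with requests for upcoming heap pages */
	while (distance < prefetch_limit &&
		   so->currPos.itemIndex + distance <= so->currPos.lastItem)
	{
		BlockNumber blkno =
			BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + distance].heapTid.ip_blkid);

		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno);
		distance++;
	}

	so->n_prefetch_requests = distance;
}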
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 1241c562397..26debe39a77 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2159,7 +2159,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index d6fef1490d5..65b66a8bcfa 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -368,6 +368,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index eba47c2f0cf..bcf5dd368a6 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,12 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" - +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); @@ -47,6 +49,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -842,6 +845,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. + * This functions returns offset of first item. + */ +static int +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert prefetch of parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach and of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? 
n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); + return offnum; +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1101,6 +1168,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scan. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. + * Also prefetch of leave pages will be useless if expected number of rows fits in one page. + */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch */ + } + else + so->prefetch_maximum = 0; /* disable prefetch */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch */ + + /* If key bounds are not specified, then we will scan the whole relation and it make sense to start with the largest possible prefetch distance */ + so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1371,6 +1469,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + /* Start prefetching for index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + int skip = ScanDirectionIsForward(dir) + ? 
stack->bts_offset - first_offset + : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int i = skip; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -1510,9 +1623,63 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; - if (scan->xs_want_itup) + if (scan->xs_want_itup) /* index-only scan */ + { scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + } + else if (so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * As far as it is difficult to predict how much items index scan will return + * we do not want to prefetch many heap pages from the very beginning because + * them may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum. + */ + + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + /* Active prefeth requests */ + prefetchDistance = so->n_prefetch_requests; + + /* + * Consume one prefetch request (if any) + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep number of active prefetch requests equal to the current prefetch distance. 
+ * When prefetch distance reaches prefetch maximum, this loop performs at most one iteration, + * but at the beginning of index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations + */ + if (ScanDirectionIsForward(dir)) + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + else + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ + } return true; } @@ -1919,6 +2086,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markItemIndex = -1; } + if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leave pages for index-only scan */ + { + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + + so->n_prefetch_requests -= 1; /* we load next leaf page, so decrement number of active prefetch requests */ + + /* Check if the are more children to prefetch at current parent page */ + if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE) + { + /* we have prefetched all items from current parent page, let's move to the next parent page */ + _bt_read_parent_for_prefetch(scan, so->next_parent, dir); + so->n_prefetch_requests -= 1; /* loading parent page consumes one more prefetch request */ + } + + /* Try to keep number of active prefetch requests equal to current prefetch distance */ + while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks) + { + so->n_prefetch_requests += 1; + PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]); + } + } + if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ @@ -2323,6 +2514,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) */ Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot) { Buffer buf; @@ -2331,6 +2523,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; + BlockNumber parent_blocknum = P_NONE; /* * If we are looking for a leaf page, okay to descend from fast root; @@ -2348,6 +2541,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + blkno = BufferGetBlockNumber(buf); for (;;) { @@ -2386,12 +2580,15 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + parent_blocknum = blkno; blkno = BTreeTupleGetDownLink(itup); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } + if (parent) + *parent 
= parent_blocknum; return buf; } @@ -2415,13 +2612,13 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; - + BlockNumber parent; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, scan->xs_snapshot); if (!BufferIsValid(buf)) { @@ -2434,6 +2631,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = Min(so->prefetch_maximum, so->n_prefetch_blocks); + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index ba539cc15b5..cd56c54f781 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -151,6 +151,8 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; bool enable_seqscan_prefetch = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c6a054d6c73..150061e1ee0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -965,6 +965,26 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of heap pages in index scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of leave pages in index-only scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexonlyscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3d9192fbff1..200a4a31d97 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1069,6 +1069,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state */ + int prefetch_maximum; /* maximal number of prefetch requests */ + + /* Prefech of referenced heap pages for index scan */ + /* To minimize waste prefetch requests we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of leave pages of B-Tree for index-only scan */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in 
prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested */ + BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1232,6 +1248,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot); /* diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 8639b59687c..96b28059f33 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -68,6 +68,9 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 4453fcaa517..5179380b06d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -109,7 +109,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -123,7 +125,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(21 rows) +(23 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 31deb69e14790bfd488222c32e41fa9747cefa7c Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 13 Apr 2023 22:42:27 +0200 Subject: [PATCH 197/214] [PG14] Feature/replicas (#278) * Recovery requirements: Add condition variable for WAL recovery; allowing backends to wait for recovery up to some record pointer. * Fix issues w.r.t. WAL when LwLsn is initiated and when recovery starts. This fixes some test failures that showed up after updating Neon code to do more precise handling of replica's get_page_at_lsn's request_lsn lsns. 
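A minimal usage sketch of the first point above (illustration only, not part of this
patch): a backend on a replica that has obtained some target LSN can block until
recovery has replayed up to that point by calling the XLogWaitForReplayOf() function
added below; the call returns immediately when the server is not in recovery. The
variable request_lsn and the wrapper function are assumptions made for the example.

    #include "access/xlog.h"

    /*
     * Hypothetical caller: wait until WAL replay has progressed to request_lsn
     * before reading pages at that LSN. XLogWaitForReplayOf() sleeps on the new
     * replayProgressCV condition variable and is woken as records are replayed.
     */
    static void
    wait_for_replay_example(XLogRecPtr request_lsn)
    {
        XLogWaitForReplayOf(request_lsn);
    }
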
--------- Co-authored-by: Matthias van de Meent --- src/backend/access/transam/xlog.c | 80 +++++++++++++++++++++++++++++-- src/include/access/xlog.h | 1 + src/include/access/xlogutils.h | 4 ++ 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dd093ec5b87..b4da9c40c44 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -754,6 +754,7 @@ typedef struct XLogCtlData TimeLineID lastReplayedTLI; XLogRecPtr replayEndRecPtr; TimeLineID replayEndTLI; + ConditionVariable replayProgressCV; /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ TimestampTz recoveryLastXTime; @@ -5342,9 +5343,67 @@ XLOGShmemInit(void) SpinLockInit(&XLogCtl->info_lck); SpinLockInit(&XLogCtl->ulsn_lck); InitSharedLatch(&XLogCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogCtl->replayProgressCV); ConditionVariableInit(&XLogCtl->recoveryNotPausedCV); } +/* + * Wait for recovery to complete replaying all WAL up to and including + * redoEndRecPtr. + * + * This gets woken up for every WAL record replayed, so make sure you're not + * trying to wait an LSN that is too far in the future. + */ +void +XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr) +{ + static XLogRecPtr replayRecPtr = 0; + + if (!RecoveryInProgress()) + return; + + /* + * Check the backend-local variable first, we may be able to skip accessing + * shared memory (which requires locking) + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + /* + * Check again if we're going to need to wait, now that we've updated + * the local cached variable. + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + /* + * We need to wait for the variable, so prepare for that. + * + * Note: This wakes up every time a WAL record is replayed, so this can + * be expensive. + */ + ConditionVariablePrepareToSleep(&XLogCtl->replayProgressCV); + + while (redoEndRecPtr > replayRecPtr) + { + bool timeout; + timeout = ConditionVariableTimedSleep(&XLogCtl->replayProgressCV, + 10000000, + WAIT_EVENT_RECOVERY_WAL_STREAM); + + if (timeout) + ereport(LOG, + (errmsg("Waiting for recovery to catch up to %X/%X", + LSN_FORMAT_ARGS(redoEndRecPtr)))); + else + replayRecPtr = GetXLogReplayRecPtr(NULL); + } + + ConditionVariableCancelSleep(); +} + /* * This func must be called ONCE on system install. It creates pg_control * and the initial XLOG segment. @@ -7267,6 +7326,14 @@ StartupXLOG(void) abortedRecPtr = InvalidXLogRecPtr; missingContrecPtr = InvalidXLogRecPtr; + /* + * Setup last written lsn cache, max written LSN. + * Starting from here, we could be modifying pages through REDO, which requires + * the existance of maxLwLsn + LwLsn LRU. 
+ */ + XLogCtl->maxLastWrittenLsn = RedoRecPtr; + dlist_init(&XLogCtl->lastWrittenLsnLRU); + /* REDO */ if (InRecovery) { @@ -7774,6 +7841,8 @@ StartupXLOG(void) WalSndWakeup(); } + ConditionVariableBroadcast(&XLogCtl->replayProgressCV); + /* Exit loop if we reached inclusive recovery target */ if (recoveryStopsAfter(xlogreader)) { @@ -8169,8 +8238,6 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->maxLastWrittenLsn = EndOfLog; - dlist_init(&XLogCtl->lastWrittenLsnLRU); LocalSetXLogInsertAllowed(); @@ -10980,11 +11047,14 @@ xlog_redo(XLogReaderState *record) XLogRedoAction result; result = XLogReadBufferForRedo(record, block_id, &buffer); - if (result == BLK_DONE && !IsUnderPostmaster) + if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode)) { /* - * In the special WAL process, blocks that are being ignored - * return BLK_DONE. Accept that. + * NEON: In the special WAL redo process, blocks that are being + * ignored return BLK_DONE. Accept that. + * Additionally, in standby mode, blocks that are not present + * in shared buffers are ignored during replay, so we also + * ignore those blocks. */ } else if (result != BLK_RESTORED) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index b784cbf0493..cf7b4e7c4c3 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -323,6 +323,7 @@ extern bool HotStandbyActive(void); extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern XLogRecPtr GetXLogInsertRecPtr(void); extern XLogRecPtr GetXLogWriteRecPtr(void); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 7cebdf3af6d..d2ba26f66b7 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -33,6 +33,10 @@ typedef enum * need to be replayed) */ } XLogRedoAction; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. + */ extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, From 74571e12347da9902c7c91d8e42bfe5455e0113e Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 7 Jul 2023 13:26:04 +0300 Subject: [PATCH 198/214] Do not allow users with CREATEROLE privelege to manage system user groups. --- src/backend/commands/user.c | 75 ++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 4621bba00e5..7e4c510d2bc 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -54,7 +54,8 @@ static void AddRoleMems(const char *rolename, Oid roleid, static void DelRoleMems(const char *rolename, Oid roleid, List *memberSpecs, List *memberIds, bool admin_opt); - +static void check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant); /* Check if current user has createrole privileges */ static bool @@ -1475,6 +1476,8 @@ AddRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(grantorId, roleid, true); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. 
@@ -1653,6 +1656,8 @@ DelRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(GetUserId(), roleid, false); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. @@ -1735,3 +1740,71 @@ DelRoleMems(const char *rolename, Oid roleid, */ table_close(pg_authmem_rel, NoLock); } + +/* + * Check that currentUserId has permission to modify the membership list for + * roleid. Throw an error if not. + */ +static void +check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant) +{ + /* + * The charter of pg_database_owner is to have exactly one, implicit, + * situation-dependent member. There's no technical need for this + * restriction. (One could lift it and take the further step of making + * object_ownercheck(DatabaseRelationId, ...) equivalent to + * has_privs_of_role(roleid, ROLE_PG_DATABASE_OWNER), in which case + * explicit, situation-independent members could act as the owner of any + * database.) + */ + if (is_grant && roleid == ROLE_PG_DATABASE_OWNER) + ereport(ERROR, + errmsg("role \"%s\" cannot have explicit members", + GetUserNameFromId(roleid, false))); + + /* To mess with a superuser role, you gotta be superuser. */ + if (superuser_arg(roleid)) + { + if (!superuser_arg(currentUserId)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may grant roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may revoke roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + } + } + else + { + /* + * Otherwise, must have admin option on the role to be changed. 
+ */ + if (!is_admin_of_role(currentUserId, roleid)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may grant this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may revoke this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + } + } +} From 6b4e16e8662cd3572004fcc40f699653dd51c39c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 15 Jul 2023 20:46:59 +0300 Subject: [PATCH 199/214] Fix regression tests after the patch with CREATEROLE restrictions --- src/test/regress/expected/privileges.out | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index edf5f3a6b16..304d9c6116f 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -1559,7 +1559,8 @@ REFRESH MATERIALIZED VIEW sro_mv; ERROR: cannot fire deferred trigger within security-restricted operation CONTEXT: SQL function "mv_action" statement 1 BEGIN; SET CONSTRAINTS ALL IMMEDIATE; REFRESH MATERIALIZED VIEW sro_mv; COMMIT; -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. CONTEXT: SQL function "unwanted_grant" statement 1 SQL statement "SELECT unwanted_grant()" PL/pgSQL function sro_trojan() line 1 at PERFORM @@ -1589,10 +1590,12 @@ CREATE FUNCTION dogrant_ok() RETURNS void LANGUAGE sql SECURITY DEFINER AS GRANT regress_priv_group2 TO regress_priv_user5; -- ok: had ADMIN OPTION SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE suspended privilege -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_user1; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: no ADMIN OPTION -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SELECT dogrant_ok(); -- ok: SECURITY DEFINER conveys ADMIN NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_group2" dogrant_ok @@ -1602,14 +1605,16 @@ NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_gro SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE did not help -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. 
SET SESSION AUTHORIZATION regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- ok: a role can self-admin NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_group2" CREATE FUNCTION dogrant_fails() RETURNS void LANGUAGE sql SECURITY DEFINER AS 'GRANT regress_priv_group2 TO regress_priv_user5'; SELECT dogrant_fails(); -- fails: no self-admin in SECURITY DEFINER -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. CONTEXT: SQL function "dogrant_fails" statement 1 DROP FUNCTION dogrant_fails(); SET SESSION AUTHORIZATION regress_priv_user4; From 56e27c3c70edfbb6eeada94e33956e8479705702 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 26 Jul 2023 15:54:11 +0300 Subject: [PATCH 200/214] Make it possible to grant self created roles (#297) Co-authored-by: Konstantin Knizhnik --- src/backend/commands/user.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 7e4c510d2bc..cc066868113 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -498,6 +498,37 @@ CreateRole(ParseState *pstate, CreateRoleStmt *stmt) } } + /* + * If the current user isn't a superuser, make them an admin of the new + * role so that they can administer the new object they just created. + * Superusers will be able to do that anyway. + * + * The grantor of record for this implicit grant is the bootstrap + * superuser, which means that the CREATEROLE user cannot revoke the + * grant. They can however grant the created role back to themselves with + * different options, since they enjoy ADMIN OPTION on it. + */ + if (!superuser()) + { + RoleSpec *current_role = makeNode(RoleSpec); + List *memberSpecs; + List *memberIds = list_make1_oid(GetUserId()); + + current_role->roletype = ROLESPEC_CURRENT_ROLE; + current_role->location = -1; + memberSpecs = list_make1(current_role); + + AddRoleMems(stmt->role, roleid, + memberSpecs, memberIds, + BOOTSTRAP_SUPERUSERID, true); + + /* + * We must make the implicit grant visible to the code below, else the + * additional grants will fail. + */ + CommandCounterIncrement(); + } + /* * Add the specified members to this new role. adminmembers get the admin * option, rolemembers don't. @@ -1521,7 +1552,7 @@ AddRoleMems(const char *rolename, Oid roleid, * use an explicit grantor specification to take advantage of the session * user's self-admin right. */ - if (grantorId != GetUserId() && !superuser()) + if (grantorId != GetUserId() && grantorId != BOOTSTRAP_SUPERUSERID && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to set grantor"))); From 394f32eae427ba518f1675668a24c6e25e036a75 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 29 Jul 2023 08:10:09 +0300 Subject: [PATCH 201/214] Define NEON_SMGR in smgr.h to make it possible for extensions to use extetnded Neon SMGR API (#299) Co-authored-by: Konstantin Knizhnik --- src/include/storage/smgr.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 19e469e1cef..43fee9e052c 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -20,6 +20,12 @@ struct f_smgr; +/* + * Neon: extended SMGR API. 
+ * This define can be used by extensions to determine that them are built for Neon. + */ +#define NEON_SMGR 1 + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) From 4bbdda2ff8fb8e3272454935daa359c4b3718100 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 13 Jun 2023 16:43:10 +0200 Subject: [PATCH 202/214] Request extension files and libraries from compute_ctl --- src/backend/commands/extension.c | 18 ++++++++ src/backend/utils/fmgr/dfmgr.c | 78 ++++++++++++++++++++++++++++++-- src/include/fmgr.h | 6 +++ 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index a71504fbcae..caa26c4249f 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -399,6 +399,7 @@ get_extension_script_directory(ExtensionControlFile *control) { char sharepath[MAXPGPATH]; char *result; + struct stat fst; /* * The directory parameter can be omitted, absolute, or relative to the @@ -414,6 +415,16 @@ get_extension_script_directory(ExtensionControlFile *control) result = (char *) palloc(MAXPGPATH); snprintf(result, MAXPGPATH, "%s/%s", sharepath, control->directory); + // If directory does not exist, check remote extension storage + if (stat(result, &fst) < 0) + { + // request download of extension files from for control->directory + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(control->directory, false); + } + } + return result; } @@ -1438,6 +1449,13 @@ CreateExtensionInternal(char *extensionName, * will get us there. */ filename = get_extension_script_filename(pcontrol, NULL, versionName); + + // request download of extension files from compute_ctl + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(extensionName, false); + } + if (stat(filename, &fst) == 0) { /* Easy, no extra scripts */ diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index e8c6cdde972..d90e3703108 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -36,6 +36,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signatures for PostgreSQL-specific library init/fini functions */ typedef void (*PG_init_t) (void); @@ -81,11 +82,13 @@ static void incompatible_module_error(const char *libname, const Pg_magic_struct *module_magic_data) pg_attribute_noreturn(); static void internal_unload_library(const char *libname); static bool file_exists(const char *name); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); static char *substitute_libpath_macro(const char *name); static char *find_in_dynamic_libpath(const char *basename); +static void neon_try_load(const char *name); + /* Magic structure that module needs to match to be accepted */ static const Pg_magic_struct magic_data = PG_MODULE_MAGIC_DATA; @@ -110,9 +113,20 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from 
compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -134,6 +148,47 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *name) +{ + bool have_slash; + char *request_name; + + // add .so suffix if it is not present + if (strstr(name, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", name, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(name); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + if (strncmp(request_name, "$libdir/", strlen("$libdir/")) == 0) + { + char *new_request_name = psprintf("%s", request_name + strlen("$libdir/")); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + name))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + download_extension_file_hook(request_name, true); +} + /* * This function loads a shlib file without looking up any particular * function in it. If the same shlib has previously been loaded, @@ -146,13 +201,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Unload the library if currently loaded */ internal_unload_library(fullname); @@ -173,7 +239,6 @@ lookup_external_function(void *filehandle, const char *funcname) return dlsym(filehandle, funcname); } - /* * Load the specified dynamic-link library file, unless it already is * loaded. Return the pg_dl* handle for the file. @@ -209,6 +274,7 @@ internal_load_library(const char *libname) errmsg("could not access file \"%s\": %m", libname))); + for (file_scanner = file_list; file_scanner != NULL && !SAME_INODE(stat_buf, *file_scanner); @@ -483,7 +549,7 @@ file_exists(const char *name) * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -529,9 +595,11 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } + /* * Check a restricted library name. 
It must begin with "$libdir/plugins/" * and there must not be any directory separators after that (this is diff --git a/src/include/fmgr.h b/src/include/fmgr.h index ab7b85c86e1..adcedeb4a82 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -774,4 +774,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? false : (*needs_fmgr_hook)(fn_oid)) + + +// download_extension_file_hook (filename, is_library) +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */ From cadbbccdb00e55b31ccb496a45c2153bcf5b67dc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Oct 2023 15:32:18 +0300 Subject: [PATCH 203/214] Neon logical replication support for PG14 (#309) * Neon logical replication support for PG14 * Log heap rewrite file after creation. --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Arseny Sher --- src/backend/access/heap/rewriteheap.c | 39 ++++++++++++++++++++- src/backend/access/transam/xlog.c | 35 +++++++++++++++--- src/backend/replication/logical/origin.c | 19 ++++++++++ src/backend/replication/logical/snapbuild.c | 11 ++++++ src/backend/replication/slot.c | 19 ++++++++++ 5 files changed, 117 insertions(+), 6 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2c159cf76a5..d3e756c9bf5 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -117,6 +117,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/slot.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -785,6 +786,36 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * ------------------------------------------------------------------------ */ +/* + * NEON: we need to persist mapping file in WAL + */ +static void +wallog_mapping_file(char const* path, int fd) +{ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + if (fd < 0) + { + elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false); + } + else + { + off_t size = lseek(fd, 0, SEEK_END); + char* buf; + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, size); + if (size < 0) + elog(ERROR, "Failed to get size of mapping file: %m"); + buf = palloc((size_t)size); + lseek(fd, 0, SEEK_SET); + if (read(fd, buf, (size_t)size) != size) + elog(ERROR, "Failed to read mapping file: %m"); + LogLogicalMessage(prefix, buf, (size_t)size, false); + pfree(buf); + } +} + /* * Do preparations for logging logical mappings during a rewrite if * necessary. 
If we detect that we don't need to log anything we'll prevent @@ -920,6 +951,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; + wallog_mapping_file(src->path, FileGetRawDesc(src->vfd)); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); @@ -1006,7 +1038,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, src->off = 0; memcpy(src->path, path, sizeof(path)); src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + O_CREAT | O_EXCL | O_RDWR | PG_BINARY); if (src->vfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1172,6 +1204,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1247,6 +1281,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + wallog_mapping_file(path, -1); } else { @@ -1275,6 +1310,8 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b4da9c40c44..81e1ef95feb 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -968,6 +968,7 @@ static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, static void LocalSetXLogInsertAllowed(void); static void CreateEndOfRecoveryRecord(void); static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn); +static void PreCheckPointGuts(int flags); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); @@ -9573,6 +9574,11 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* + * NEON: perform checkpiont action requiring write to the WAL before we determine the REDO pointer. + */ + PreCheckPointGuts(flags); + /* * Use a critical section to force system panic if we have trouble. */ @@ -10042,6 +10048,28 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) return recptr; } +static void +CheckPointReplicationState(void) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); +} + +/* + * NEON: we use logical records to persist information of about slots, origins, relation map... + * If it is done inside shutdown checkpoint, then Postgres panics: "concurrent write-ahead log activity while database system is shutting down" + * So it before checkpoint REDO position is determined. 
+ */ +static void +PreCheckPointGuts(int flags) +{ + if (flags & CHECKPOINT_IS_SHUTDOWN) + CheckPointReplicationState(); +} + /* * Flush all data in shared memory to disk, and fsync * @@ -10051,11 +10079,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointRelationMap(); - CheckPointReplicationSlots(); - CheckPointSnapBuild(); - CheckPointLogicalRewriteHeap(); - CheckPointReplicationOrigin(); + if (!(flags & CHECKPOINT_IS_SHUTDOWN)) + CheckPointReplicationState(); /* Write out all dirty data in SLRUs and the main buffer pool */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 6988840fd49..d66a038d4c9 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -81,6 +81,7 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/origin.h" #include "storage/condition_variable.h" #include "storage/copydir.h" @@ -562,10 +563,14 @@ CheckPointReplicationOrigin(void) int i; uint32 magic = REPLICATION_STATE_MAGIC; pg_crc32c crc; + char *buf; + size_t chkp_size; if (max_replication_slots == 0) return; + buf = palloc(sizeof(magic) + max_replication_slots*sizeof(ReplicationStateOnDisk) + sizeof(crc)); + INIT_CRC32C(crc); /* make sure no old temp file is remaining */ @@ -599,6 +604,9 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf, &magic, sizeof magic); + chkp_size = sizeof(magic); + COMP_CRC32C(crc, &magic, sizeof(magic)); /* prevent concurrent creations/drops */ @@ -641,6 +649,8 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf + chkp_size, &disk_state, sizeof(disk_state)); + chkp_size += sizeof(disk_state); COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); } @@ -660,6 +670,15 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + if (chkp_size != sizeof(magic)) /* has some valid origins */ + { + memcpy(buf + chkp_size, &crc, sizeof crc); + chkp_size += sizeof(crc); + + /* NEON specific: persist snapshot in storage using logical message */ + LogLogicalMessage("neon-file:pg_logical/replorigin_checkpoint", buf, chkp_size, false); + } + pfree(buf); if (CloseTransientFile(tmpfd) != 0) ereport(PANIC, diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 4f8c00f717a..4b258faa77c 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -126,6 +126,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/reorderbuffer.h" #include "replication/snapbuild.h" #include "storage/block.h" /* debugging output */ @@ -1604,6 +1605,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) int fd; char tmppath[MAXPGPATH]; char path[MAXPGPATH]; + char prefix[MAXPGPATH]; int ret; struct stat stat_buf; Size sz; @@ -1726,6 +1728,10 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", tmppath))); + /* NEON specific: persist snapshot in storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, (char*)ondisk, needed_length, false); + 
errno = 0; pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); if ((write(fd, ondisk, needed_length)) != needed_length) @@ -2032,6 +2038,7 @@ CheckPointSnapBuild(void) DIR *snap_dir; struct dirent *snap_de; char path[MAXPGPATH + 21]; + char prefix[MAXPGPATH + 31]; /* * We start off with a minimum of the last redo pointer. No new @@ -2090,6 +2097,10 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); + /* NEON specific: delete file from storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, NULL, 0, false); + /* * It's not particularly harmful, though strange, if we can't * remove the file here. Don't prevent the checkpoint from diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 037a347cba0..ca6619c187b 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -45,6 +45,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/slot.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -605,6 +606,15 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + if (SlotIsLogical(slot)) + { + /* NEON specific: delete slot from storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); + elog(LOG, "Drop replication slot %s", path); + LogLogicalMessage(prefix, NULL, 0, false); + } + /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the @@ -1569,6 +1579,15 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) SnapBuildOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); + if (SlotIsLogical(slot) && cp.slotdata.restart_lsn != InvalidXLogRecPtr) + { + /* NEON specific: persist slot in storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); + LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false); + } + errno = 0; pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) From 1ce546925060158308a01b60e588a5af7e1f100b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Oct 2023 15:57:23 +0300 Subject: [PATCH 204/214] Fix elog format error in wallog_mapping_file (#315) Co-authored-by: Konstantin Knizhnik --- src/backend/access/heap/rewriteheap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index d3e756c9bf5..3fbc091ff70 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -804,7 +804,7 @@ wallog_mapping_file(char const* path, int fd) { off_t size = lseek(fd, 0, SEEK_END); char* buf; - elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, size); + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); if (size < 0) elog(ERROR, "Failed to get size of mapping file: %m"); buf = palloc((size_t)size); From ea711707a18a77a8478119a6044104c7c0d9f0f2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 19 Oct 2023 12:11:00 +0300 Subject: [PATCH 205/214] Remove excessive walsender 
reply logging. --- src/backend/replication/walsender.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 23572609144..c1f8f1c60ea 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1936,13 +1936,6 @@ ProcessStandbyReplyMessage(void) applyPtr, replyTime, replyRequested); - - elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", - LSN_FORMAT_ARGS(writePtr)); - elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", - LSN_FORMAT_ARGS(flushPtr)); - elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", - LSN_FORMAT_ARGS(applyPtr)); } void From f56cd58eda7c2661a78e2a6876f1c811aca0ffc6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 3 Nov 2023 15:57:44 +0200 Subject: [PATCH 206/214] Update WAL buffers when restoring WAL at compute needed for LR (#325) * Update WAL buffers when restoring WAL at compute needed for LR * Fix copying data in WAL buffers --------- Co-authored-by: Konstantin Knizhnik --- src/backend/access/transam/xlog.c | 26 ++++++++++++++++++++++++++ src/include/access/xlog.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 81e1ef95feb..b40d249ad89 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -13757,3 +13757,29 @@ XLogRequestWalReceiverReply(void) { doRequestWalReceiverReply = true; } + +void +XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len) +{ + XLogRecPtr end; + int idx; + XLogRecPtr pagebegptr; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + end = start + len; + idx = XLogRecPtrToBufIdx(end); + pagebegptr = XLogCtl->xlblocks[idx] - XLOG_BLCKSZ; + + if (pagebegptr + XLOG_BLCKSZ >= end && pagebegptr < end) + { + /* Last page of the segment is present in WAL buffers */ + char* page = &XLogCtl->pages[idx * XLOG_BLCKSZ]; + size_t overlap = end - pagebegptr; + if (overlap <= len) + memcpy(page, data + len - overlap, overlap); + else + memcpy(page + overlap - len, data, len); + } + LWLockRelease(WALBufMappingLock); +} diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cf7b4e7c4c3..32e5728eb15 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -377,6 +377,8 @@ extern void SetWalWriterSleeping(bool sleeping); extern void StartupRequestWalReceiverRestart(void); extern void XLogRequestWalReceiverReply(void); +extern void XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len); + extern void assign_max_wal_size(int newval, void *extra); extern void assign_checkpoint_completion_target(double newval, void *extra); From b2fc5c6d43ca0fd979e00447e815e0c1dd395528 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 14 Dec 2023 10:04:08 -0800 Subject: [PATCH 207/214] Prevent output callbacks from hearing about neon-file messages (#330) * Prevent output callbacks from hearing about neon-file messages --- src/backend/replication/logical/logical.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index f7d14919077..dd44d530f70 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1174,6 +1174,8 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, if (ctx->callbacks.message_cb == NULL) return; + if (strcmp(prefix, "neon-file") == 0) + return; /* Push callback + info on the error context stack */ 
state.ctx = ctx; @@ -1489,6 +1491,8 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, /* this callback is optional */ if (ctx->callbacks.stream_message_cb == NULL) return; + if (strcmp(prefix, "neon-file")) == 0) + return; /* Push callback + info on the error context stack */ state.ctx = ctx; From 546a6d1628c72c7c42315e068facb0c8e46d6ecd Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 15 Dec 2023 09:26:39 -0800 Subject: [PATCH 208/214] strncmp vs strcmp --- src/backend/replication/logical/logical.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index dd44d530f70..0d257cc5e34 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1174,7 +1174,7 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, if (ctx->callbacks.message_cb == NULL) return; - if (strcmp(prefix, "neon-file") == 0) + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) return; /* Push callback + info on the error context stack */ @@ -1491,7 +1491,7 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, /* this callback is optional */ if (ctx->callbacks.stream_message_cb == NULL) return; - if (strcmp(prefix, "neon-file")) == 0) + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) return; /* Push callback + info on the error context stack */ From ab9db70abbd1b3e6a92de7545cb8ba8a7966ee3e Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 11 Jan 2024 12:28:34 -0800 Subject: [PATCH 209/214] Allow creating publications FOR ALL TABLES --- src/backend/commands/publicationcmds.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index e288dd41cdc..655f10ef833 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -138,6 +138,13 @@ parse_publication_options(List *options, } } +static bool +is_neon_superuser(void) +{ + Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/); + return neon_superuser_oid != InvalidOid && has_privs_of_role(GetCurrentRoleId(), neon_superuser_oid); +} + /* * Create new publication. 
*/ @@ -163,7 +170,7 @@ CreatePublication(CreatePublicationStmt *stmt) get_database_name(MyDatabaseId)); /* FOR ALL TABLES requires superuser */ - if (stmt->for_all_tables && !superuser()) + if (stmt->for_all_tables && !superuser() && !is_neon_superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create FOR ALL TABLES publication"))); From 0309285bcfe681992d754acd2c20f5e08fa8bbc6 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 23 Jan 2024 16:30:18 -0800 Subject: [PATCH 210/214] Switch GetCurrentRoleId to GetUserId --- src/backend/commands/publicationcmds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index 655f10ef833..4b06f736a63 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -142,7 +142,7 @@ static bool is_neon_superuser(void) { Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/); - return neon_superuser_oid != InvalidOid && has_privs_of_role(GetCurrentRoleId(), neon_superuser_oid); + return neon_superuser_oid != InvalidOid && has_privs_of_role(GetUserId(), neon_superuser_oid); } /* From f378dcd9a1044487e9bbe879f4ce66f765ff1d8d Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 25 Jan 2024 21:28:33 +0000 Subject: [PATCH 211/214] Support creating subscriptions as neon_superuser --- src/backend/commands/publicationcmds.c | 7 ------- src/backend/commands/subscriptioncmds.c | 4 ++-- src/backend/utils/adt/acl.c | 12 ++++++++++++ src/include/miscadmin.h | 4 ++++ 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index 4b06f736a63..ea5dc1d60cc 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -138,13 +138,6 @@ parse_publication_options(List *options, } } -static bool -is_neon_superuser(void) -{ - Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/); - return neon_superuser_oid != InvalidOid && has_privs_of_role(GetUserId(), neon_superuser_oid); -} - /* * Create new publication. */ diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index b1a2f3f81a2..bda0ef0f2c5 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -374,7 +374,7 @@ CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) if (create_slot) PreventInTransactionBlock(isTopLevel, "CREATE SUBSCRIPTION ... 
WITH (create_slot = true)"); - if (!superuser()) + if (!superuser() && !is_neon_superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create subscriptions"))); @@ -1389,7 +1389,7 @@ AlterSubscriptionOwner_internal(Relation rel, HeapTuple tup, Oid newOwnerId) NameStr(form->subname)); /* New owner must be a superuser */ - if (!superuser_arg(newOwnerId)) + if (!superuser_arg(newOwnerId) && !is_neon_superuser_arg(newOwnerId)) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to change owner of subscription \"%s\"", diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index a44428da7ae..0e5cfb0e58d 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -114,6 +114,18 @@ static AclResult pg_role_aclcheck(Oid role_oid, Oid roleid, AclMode mode); static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue); +bool +is_neon_superuser(void) +{ + return is_neon_superuser_arg(GetUserId()); +} + +bool +is_neon_superuser_arg(Oid roleid) +{ + Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/); + return neon_superuser_oid != InvalidOid && has_privs_of_role(roleid, neon_superuser_oid); +} /* * getid diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index fd5e542d95c..4a48eaa67a4 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -373,6 +373,10 @@ extern void SetCurrentRoleId(Oid roleid, bool is_superuser); extern bool superuser(void); /* current user is superuser */ extern bool superuser_arg(Oid roleid); /* given user is superuser */ +/* in utils/adt/acl.c */ +extern bool is_neon_superuser(void); /* current user is neon_superuser */ +extern bool is_neon_superuser_arg(Oid roleid); /* given user is neon_superuser */ + /***************************************************************************** * pmod.h -- * From 018fb052011081dc2733d3118d12e5c36df6eba1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 31 Jan 2024 15:40:27 +0200 Subject: [PATCH 212/214] On demand downloading of SLRU segments (#333) * On demand downloading of SLRU segments * Fix smgr_read_slru_segment * Fix bug in SimpleLruDownloadSegment * Determine SLRU kind in extension * Use ctl->PagePrecedes for SLRU page comparison in SimpleLruDownloadSegment to address wraparround --------- Co-authored-by: Konstantin Knizhnik --- src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++---- src/backend/storage/smgr/smgr.c | 16 +++++ src/include/storage/smgr.h | 5 +- 3 files changed, 112 insertions(+), 14 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 3ee01ef1a21..c4a5e48f77a 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "storage/smgr.h" #define SlruFileName(ctl, path, seg) \ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) @@ -616,6 +617,66 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) SlruInternalWritePage(ctl, slotno, NULL); } + +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. 
+ * If SLRU segment is not found, we try to download it from page server + */ +static int +SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) +{ + int segno; + int fd = -1; + int n_blocks; + char* buffer; + + static SMgrRelationData dummy_smgr_rel = {0}; + + /* If page is greater than latest written page, then do not try to download segment from server */ + if (ctl->PagePrecedes(ctl->shared->latest_page_number, pageno)) + return -1; + + if (!dummy_smgr_rel.smgr) + { + RelFileNode rnode = {0}; + dummy_smgr_rel.smgr = smgr(InvalidBackendId, rnode); + } + segno = pageno / SLRU_PAGES_PER_SEGMENT; + + buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, path, segno, buffer); + if (n_blocks > 0) + { + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + pfree(buffer); + return -1; + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + + CloseTransientFile(fd); + pfree(buffer); + return -1; + } + pgstat_report_wait_end(); + } + pfree(buffer); + return fd; +} + /* * Return whether the given page exists on disk. * @@ -643,12 +704,18 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) { /* expected: file doesn't exist */ if (errno == ENOENT) - return false; - - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); + { + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + return false; + } + else + { + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) @@ -702,18 +769,30 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - if (errno != ENOENT || !InRecovery) + if (errno != ENOENT) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } - - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + { + if (!InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + else + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + } } errno = 0; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index f1e676bcc3e..574205a526f 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -538,6 +538,22 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, buffer, skipFsync); } +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + * + * This function returns number of blocks in segment. Usually it should be SLRU_PAGES_PER_SEGMENT but in case + * of partial segment, it can be smaller. Zero value means that segment doesn't exist. 
+ * From the Postgres point of view, an empty segment is the same as an absent segment.
+ */
+int
+smgr_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
+{
+	return (*reln->smgr).smgr_read_slru_segment ? (*reln->smgr).smgr_read_slru_segment(reln, path, segno, buffer) : 0;
+}
+
+
 /*
  * smgrwriteback() -- Trigger kernel writeback for the supplied range of
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 43fee9e052c..79a76594a41 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -88,7 +88,6 @@ typedef SMgrRelationData *SMgrRelation;
 #define SmgrIsTemp(smgr) \
 	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
 
-
 /*
  * This struct of function pointers defines the API between smgr.c and
  * any individual storage manager module. Note that smgr subfunctions are
@@ -129,6 +128,8 @@ typedef struct f_smgr
 	void		(*smgr_start_unlogged_build) (SMgrRelation reln);
 	void		(*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln);
 	void		(*smgr_end_unlogged_build) (SMgrRelation reln);
+
+	int			(*smgr_read_slru_segment) (SMgrRelation reln, const char *path, int segno, void* buffer);
 } f_smgr;
 
 typedef void (*smgr_init_hook_type) (void);
@@ -180,4 +181,6 @@ extern void smgr_start_unlogged_build(SMgrRelation reln);
 extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln);
 extern void smgr_end_unlogged_build(SMgrRelation reln);
 
+extern int smgr_read_slru_segment(SMgrRelation reln, const char *path, int segno, void* buffer);
+
 #endif							/* SMGR_H */

From 06c440ac35cb75e2241c4e90c5dfca05348a64a3 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 8 Feb 2024 19:02:20 +0200
Subject: [PATCH 213/214] Persist pgstat file to preserve statistics between sessions

---
 src/backend/access/heap/rewriteheap.c     | 38 ++---------------
 src/backend/postmaster/pgstat.c           |  7 ++++
 src/backend/replication/logical/message.c | 50 +++++++++++++++++++++++
 src/include/replication/message.h         |  3 ++
 4 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 3fbc091ff70..09701b5908e 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -786,36 +786,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
  * ------------------------------------------------------------------------
  */
 
-/*
- * NEON: we need to persist mapping file in WAL
- */
-static void
-wallog_mapping_file(char const* path, int fd)
-{
-	char prefix[MAXPGPATH];
-	snprintf(prefix, sizeof(prefix), "neon-file:%s", path);
-	if (fd < 0)
-	{
-		elog(DEBUG1, "neon: deleting contents of rewrite file %s", path);
-		/* unlink file */
-		LogLogicalMessage(prefix, NULL, 0, false);
-	}
-	else
-	{
-		off_t size = lseek(fd, 0, SEEK_END);
-		char* buf;
-		elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size);
-		if (size < 0)
-			elog(ERROR, "Failed to get size of mapping file: %m");
-		buf = palloc((size_t)size);
-		lseek(fd, 0, SEEK_SET);
-		if (read(fd, buf, (size_t)size) != size)
-			elog(ERROR, "Failed to read mapping file: %m");
-		LogLogicalMessage(prefix, buf, (size_t)size, false);
-		pfree(buf);
-	}
-}
-
 /*
  * Do preparations for logging logical mappings during a rewrite if
  * necessary. If we detect that we don't need to log anything we'll prevent
@@ -951,7 +921,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
 					 errmsg("could not write to file \"%s\", wrote %d of %d: %m",
 							src->path, written, len)));
 		src->off += len;
 
-		wallog_mapping_file(src->path, FileGetRawDesc(src->vfd));
+		wallog_file_descriptor(src->path, FileGetRawDesc(src->vfd));
 
 		XLogBeginInsert();
 		XLogRegisterData((char *) (&xlrec), sizeof(xlrec));
@@ -1204,7 +1174,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
 				 errmsg("could not fsync file \"%s\": %m", path)));
 	pgstat_report_wait_end();
 
-	wallog_mapping_file(path, fd);
+	wallog_file_descriptor(path, fd);
 
 	if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
@@ -1281,7 +1251,7 @@ CheckPointLogicalRewriteHeap(void)
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m", path)));
-			wallog_mapping_file(path, -1);
+			wallog_file_descriptor(path, -1);
 		}
 		else
 		{
@@ -1310,7 +1280,7 @@ CheckPointLogicalRewriteHeap(void)
 						 errmsg("could not fsync file \"%s\": %m", path)));
 			pgstat_report_wait_end();
 
-			wallog_mapping_file(path, fd);
+			wallog_file_descriptor(path, fd);
 
 			if (CloseTransientFile(fd) != 0)
 				ereport(ERROR,
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index a2f75b23b8b..3eb77b8fd2b 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -51,6 +51,7 @@
 #include "postmaster/fork_process.h"
 #include "postmaster/interrupt.h"
 #include "postmaster/postmaster.h"
+#include "replication/message.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/backendid.h"
@@ -3757,6 +3758,9 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	 */
 	list_free(pending_write_requests);
 	pending_write_requests = NIL;
+
+	if (XLogInsertAllowed())
+		wallog_file(statfile);
 }
 
 /*
@@ -3890,6 +3894,9 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 		elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
 		unlink(statfile);
 	}
+
+	if (XLogInsertAllowed())
+		wallog_file(statfile);
 }
 
 /* ----------
diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c
index 93bd372421a..c53ddfb4fc0 100644
--- a/src/backend/replication/logical/message.c
+++ b/src/backend/replication/logical/message.c
@@ -31,6 +31,8 @@
 
 #include "postgres.h"
 
+#include
+
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "nodes/execnodes.h"
@@ -86,3 +88,51 @@ logicalmsg_redo(XLogReaderState *record)
 
 	/* This is only interesting for logical decoding, see decode.c. */
 }
+
+/*
+ * NEON: persist file in WAL to save it in persistent storage.
+ * If fd < 0, then remove the entry from the page server.
+ */
+void
+wallog_file_descriptor(char const* path, int fd)
+{
+	char prefix[MAXPGPATH];
+	snprintf(prefix, sizeof(prefix), "neon-file:%s", path);
+	if (fd < 0)
+	{
+		elog(DEBUG1, "neon: deleting contents of rewrite file %s", path);
+		/* unlink file */
+		LogLogicalMessage(prefix, NULL, 0, false);
+	}
+	else
+	{
+		off_t size = lseek(fd, 0, SEEK_END);
+		char* buf;
+		elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size);
+		if (size < 0)
+			elog(ERROR, "Failed to get size of mapping file: %m");
+		buf = palloc((size_t)size);
+		lseek(fd, 0, SEEK_SET);
+		if (read(fd, buf, (size_t)size) != size)
+			elog(ERROR, "Failed to read mapping file: %m");
+		LogLogicalMessage(prefix, buf, (size_t)size, false);
+		pfree(buf);
+	}
+}
+
+void
+wallog_file(char const* path)
+{
+	int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+	if (fd < 0)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+	}
+	else
+	{
+		wallog_file_descriptor(path, fd);
+		CloseTransientFile(fd);
+	}
+}
diff --git a/src/include/replication/message.h b/src/include/replication/message.h
index d3fb324c816..d1eb8349605 100644
--- a/src/include/replication/message.h
+++ b/src/include/replication/message.h
@@ -38,4 +38,7 @@ void		logicalmsg_redo(XLogReaderState *record);
 void		logicalmsg_desc(StringInfo buf, XLogReaderState *record);
 const char *logicalmsg_identify(uint8 info);
 
+extern void wallog_file(char const* path);
+extern void wallog_file_descriptor(char const* path, int fd);
+
 #endif							/* PG_LOGICAL_MESSAGE_H */

From 624f2b3998dea3bb3667d26dd454b30f45ea8686 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 9 Feb 2024 10:22:07 +0200
Subject: [PATCH 214/214] Persist pgstat file only if it is successfully created

---
 src/backend/postmaster/pgstat.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 3eb77b8fd2b..d52100c6554 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3748,6 +3748,8 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 						tmpfile, statfile)));
 		unlink(tmpfile);
 	}
+	else if (XLogInsertAllowed())
+		wallog_file(statfile);
 
 	if (permanent)
 		unlink(pgstat_stat_filename);
@@ -3758,9 +3760,6 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 	 */
 	list_free(pending_write_requests);
 	pending_write_requests = NIL;
-
-	if (XLogInsertAllowed())
-		wallog_file(statfile);
 }
 
 /*
@@ -3886,6 +3885,8 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 						tmpfile, statfile)));
 		unlink(tmpfile);
 	}
+	else if (XLogInsertAllowed())
+		wallog_file(statfile);
 
 	if (permanent)
 	{
@@ -3894,9 +3895,6 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
 		elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
 		unlink(statfile);
 	}
-
-	if (XLogInsertAllowed())
-		wallog_file(statfile);
 }
 
 /* ----------
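For readers wiring the SLRU download path into their own storage manager: the sketch below is illustrative only and is not part of the patch series above. It shows how an extension could register an f_smgr table through smgr_hook and supply the smgr_read_slru_segment callback that SimpleLruDownloadSegment ends up calling. The demo_* names and pageserver_fetch_slru_segment() are hypothetical placeholders, not functions defined anywhere in these patches.

/*
 * Minimal sketch of an extension that overrides only the SLRU segment
 * download callback, keeping the standard md.c behaviour for everything else.
 */
#include "postgres.h"

#include "fmgr.h"
#include "storage/smgr.h"

PG_MODULE_MAGIC;

/* Hypothetical transport routine: fills "buffer" with up to SLRU_PAGES_PER_SEGMENT
 * pages and returns the number of BLCKSZ pages fetched, or 0 if the segment is absent. */
extern int	pageserver_fetch_slru_segment(const char *path, int segno, void *buffer);

static f_smgr demo_smgr;		/* copy of the standard callbacks, patched in _PG_init() */

static int
demo_read_slru_segment(SMgrRelation reln, const char *path, int segno, void *buffer)
{
	/* Returning 0 means "segment does not exist"; the SLRU caller treats
	 * an empty segment the same as an absent one. */
	return pageserver_fetch_slru_segment(path, segno, buffer);
}

static const f_smgr *
demo_smgr_hook(BackendId backend, RelFileNode rnode)
{
	return &demo_smgr;
}

void
_PG_init(void)
{
	RelFileNode dummy = {0};

	/* Start from the built-in implementation and override a single callback. */
	demo_smgr = *smgr_standard(InvalidBackendId, dummy);
	demo_smgr.smgr_read_slru_segment = demo_read_slru_segment;

	smgr_hook = demo_smgr_hook;
}

When the callback returns a positive block count, SimpleLruDownloadSegment writes the fetched pages into the local pg_xact/multixact segment file, so subsequent reads of that segment are served from local disk as usual.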