diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 6bb34d2f4f7..8efd0368f65 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1489,6 +1489,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, bool newitemonleft, isleaf, isrightmost; + uint16 origcycleid; /* * origpage is the original page to be split. leftpage is a temporary @@ -1509,6 +1510,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, isrightmost = P_RIGHTMOST(oopaque); maxoff = PageGetMaxOffsetNumber(origpage); origpagenumber = BufferGetBlockNumber(buf); + /* NEON: store the page's former cycle ID for FPI check later */ + origcycleid = oopaque->btpo_cycleid; /* * Choose a point to split origpage at. @@ -1964,6 +1967,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; + uint8 bufflags = REGBUF_STANDARD; xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ @@ -1976,7 +1980,27 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + /* + * NEON: If we split to earlier pages during a btree vacuum cycle, + * then we have to include the cycle ID in the WAL record. The + * easiest method to do that is to force an image, which happens to + * be relatively cheap, as the data already contained in the record is + * enough to populate the new right page. + * + * We MUST log an FPI when the page is split during a vacuum cycle, and: + * - The right page's blkno < the left page's blkno, or + * - The right page might be 'C' in a page split chain B > C > A after + * B split B > A => B > C > A; or B > C > D > A, etc. (as indicated + * by the presence of a cycle ID). 
+ */ + if (oopaque->btpo_cycleid != 0 && + (origpagenumber > rightpagenumber || oopaque->btpo_cycleid == origcycleid)) + { + /* cycle ID is required */ + bufflags |= REGBUF_FORCE_IMAGE; + } + + XLogRegisterBuffer(0, buf, bufflags); XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); /* Log original right sibling, since we've changed its prev-pointer */ if (!isrightmost) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index f9186ca233a..b6af307f5d9 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -434,6 +434,75 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) MarkBufferDirty(buf); } + /* + * NEON: If the original page was supposed to be recovered from FPI, + * then we need to correct the cycle ID (see _bt_split for reasons) + * + * Note that we can't just use the buffer in WALRedo on Pageserver, + * as that may be InvalidBuffer when the original (left) page of the + * split wasn't requested. + */ + if (XLogRecGetBlock(record, 0)->has_image) + { + /* + * btree split FPIs may contain important cycle IDs on the original + * page's FPI; make sure we correctly transfer this over + */ + + /* + * Because we don't want to decompress the page if it's not needed, or + * reconstruct a whole 8kB page when we're only interested in the 2 + * bytes of the bkpimg, we recognise there are 3 different ways we can + * get the data, in order of efficiency (from most efficient to least + * efficient): + * - There is an original (left) page in the buffer + * - There is no original buffer, the logged FPI was not compressed + * - There is no original buffer, the logged FPI was compressed + */ + if (BufferIsValid(buf)) + { + /* + * Neat, we can just use the buffer to copy the cycle ID + */ + BTPageOpaque oopaque = BTPageGetOpaque(BufferGetPage(buf)); + ropaque->btpo_cycleid = oopaque->btpo_cycleid; + } + else if (!BKPIMAGE_COMPRESSED(XLogRecGetBlock(record, 0)->bimg_info)) + { + /* + * Good, we don't 
have to decompress the data, so we can use + * calculated offsets into bkpb->bkp_image + */ + + /* + * offset of the start of cycleid relative to the end of the page, + * which is also relative to the end of the FPI + */ + const int cycleid_off = MAXALIGN(sizeof(BTPageOpaqueData)) + - offsetof(BTPageOpaqueData, btpo_cycleid); + char *cycleid_ptr; /* may not be aligned */ + DecodedBkpBlock *bkpb = XLogRecGetBlock(record, 0); + + cycleid_ptr = &bkpb->bkp_image[bkpb->bimg_len - cycleid_off]; + + memcpy(&ropaque->btpo_cycleid, cycleid_ptr, sizeof(BTCycleId)); + } + else + { + /* + * Bummer, we have to decompress the data. + */ + PGAlignedBlock tmp; + BTPageOpaque oopaque; + + /* Expensive decompression of data */ + RestoreBlockImage(record, 0, tmp.data); + + oopaque = BTPageGetOpaque(tmp.data); + ropaque->btpo_cycleid = oopaque->btpo_cycleid; + } + } + /* Fix left-link of the page to the right of the new right sibling */ if (spagenumber != P_NONE) {