Skip to content

Commit

Permalink
Abuse Checkpoint.redo field to distinguish shutdown checkpoints (#10010)
Browse files Browse the repository at this point in the history
This is for PR #6560. It implements the scheme I envisioned at
#6560 (comment)
  • Loading branch information
hlinnaka authored and Konstantin Knizhnik committed Dec 16, 2024
1 parent f5365b1 commit 95f4d5b
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 49 deletions.
5 changes: 2 additions & 3 deletions libs/postgres_ffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
pub use v14::bindings::{PageHeaderData, XLogRecord};
pub use v14::xlog_utils::{
SIZEOF_CHECKPOINT, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD,
XLOG_SIZE_OF_XLOG_SHORT_PHD,
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};

pub use v14::bindings::{CheckPoint, ControlFileData};
Expand Down Expand Up @@ -279,7 +278,7 @@ pub fn generate_pg_control(
checkpoint_bytes: &[u8],
lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<(Bytes, u64)> {
) -> anyhow::Result<(Bytes, u64, bool)> {
dispatch_pgversion!(
pg_version,
pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
Expand Down
2 changes: 0 additions & 2 deletions libs/postgres_ffi/src/pg_constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,6 @@ pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */

pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 2;

/* Information stored in bimg_info */
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */

Expand Down
49 changes: 45 additions & 4 deletions libs/postgres_ffi/src/xlog_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,23 +124,64 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
}
}

/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN
///
/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in
/// the pageserver. They use the same format as the PostgreSQL control file and the
/// checkpoint record, but see walingest.rs for how exactly they are kept up to date.
/// 'lsn' is the LSN at which we're starting up.
///
/// Returns:
/// - pg_control file contents
/// - system_identifier, extracted from the persisted information
/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown
/// checkpoint at the given LSN
pub fn generate_pg_control(
pg_control_bytes: &[u8],
checkpoint_bytes: &[u8],
lsn: Lsn,
) -> anyhow::Result<(Bytes, u64)> {
) -> anyhow::Result<(Bytes, u64, bool)> {
let mut pg_control = ControlFileData::decode(pg_control_bytes)?;
let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?;
let was_shutdown;

// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
//
// NB: In the checkpoint struct that we persist in the pageserver, we have a different
// convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint,
// 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to
// the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0.
//
// We didn't always have this convention however, and old persisted records will have
// old REDO values that point to some old LSN.
//
// The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown
// checkpoint record at that point in WAL, with no new WAL records after it. That case
// can be treated as starting from a clean shutdown. All other cases are treated as
// non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so
// that distinction doesn't matter very much. As of this writing, it only affects
// whether the persisted pg_stats information can be used or not.
//
// In the Checkpoint struct in the returned pg_control file, the redo pointer is
// always set to the LSN we're starting at, to hint that no WAL replay is required.
// (There's some neon-specific code in Postgres startup to make that work, though.
// Just setting the redo pointer is not sufficient.)
if Lsn(checkpoint.redo) == lsn {
was_shutdown = true;
} else {
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
was_shutdown = false;
}

//save new values in pg_control
// We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The
// neon-specific code at postgres startup ignores the state stored in the control
// file, similar to archive recovery in standalone PostgreSQL. Similarly, the
// checkPoint pointer is ignored, so just set it to 0.
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;
pg_control.state = DBState_DB_SHUTDOWNED;

Ok((pg_control.encode(), pg_control.system_identifier))
Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown))
}

pub fn get_current_timestamp() -> TimestampTz {
Expand Down
67 changes: 34 additions & 33 deletions pageserver/src/basebackup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};

use postgres_ffi::dispatch_pgversion;
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
use postgres_ffi::XLogFileName;
use postgres_ffi::PG_TLI;
use postgres_ffi::{dispatch_pgversion, CheckPoint};
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;

Expand Down Expand Up @@ -255,16 +255,30 @@ where
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
// TODO include checksum

// Detect if we are creating the basebackup exactly at a shutdown checkpoint.
let normal_shutdown =
if let Ok(checkpoint_bytes) = self.timeline.get_checkpoint(self.lsn, self.ctx).await {
let checkpoint =
CheckPoint::decode(&checkpoint_bytes).context("deserialize checkpoint")?;
// We store "redo" field only for shutdown checkpoint
checkpoint.redo != 0
} else {
false
};
// Construct the pg_control file from the persisted checkpoint and pg_control
// information. But we only add this to the tarball at the end, so that if the
// writing is interrupted half-way through, the resulting incomplete tarball will
// be missing the pg_control file, which prevents PostgreSQL from starting up on
// it. With proper error handling, you should never try to start up from an
// incomplete basebackup in the first place, of course, but this is a nice little
// extra safety measure.
let checkpoint_bytes = self
.timeline
.get_checkpoint(self.lsn, self.ctx)
.await
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_control_file(self.lsn, self.ctx)
.await
.context("failed get control bytes")?;
let (pg_control_bytes, system_identifier, was_shutdown) =
postgres_ffi::generate_pg_control(
&pg_control_bytes,
&checkpoint_bytes,
self.lsn,
self.timeline.pg_version,
)?;

let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;

Expand Down Expand Up @@ -403,7 +417,7 @@ where
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
// but now we should handle (skip) it for backward compatibility.
continue;
} else if path == "pg_stat/pgstat.stat" && !normal_shutdown {
} else if path == "pg_stat/pgstat.stat" && !was_shutdown {
// Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN
// of a shutdown checkpoint.
continue;
Expand Down Expand Up @@ -468,8 +482,9 @@ where
)))
});

// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
// Last, add the pg_control file and bootstrap WAL segment.
self.add_pgcontrol_file(pg_control_bytes, system_identifier)
.await?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
debug!("all tarred up!");
Ok(())
Expand Down Expand Up @@ -672,7 +687,11 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
async fn add_pgcontrol_file(
&mut self,
pg_control_bytes: Bytes,
system_identifier: u64,
) -> Result<(), BasebackupError> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
Expand All @@ -695,24 +714,6 @@ where
.await
.map_err(BasebackupError::Client)?;

let checkpoint_bytes = self
.timeline
.get_checkpoint(self.lsn, self.ctx)
.await
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_control_file(self.lsn, self.ctx)
.await
.context("failed get control bytes")?;

let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
&pg_control_bytes,
&checkpoint_bytes,
self.lsn,
self.timeline.pg_version,
)?;

//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar
Expand Down
47 changes: 40 additions & 7 deletions pageserver/src/walingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1194,18 +1194,51 @@ impl WalIngest {
} else {
cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
}
// NB: We abuse the Checkpoint.redo field:
//
// There is no any field in CheckPoint which allows to distinguish online
// and shutdown checkpoint. And we need to know it to detect normal shutdown.
// Previously "redo" field was not set at all.
// So let's assign this field only for shutdown checkpoint and left it zero
// for online checkpoint.
// - In PostgreSQL, the Checkpoint struct doesn't store the information
// of whether this is an online checkpoint or a shutdown checkpoint. It's
// stored in the XLOG info field of the WAL record, shutdown checkpoints
// use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use
// XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers
// in the pageserver, however.
//
// - In PostgreSQL, the Checkpoint.redo field stores the *start* of the
// checkpoint record, if it's a shutdown checkpoint. But when we are
// starting from a shutdown checkpoint, the basebackup LSN is the *end*
// of the shutdown checkpoint WAL record. That makes it difficult to
// correctly detect whether we're starting from a shutdown record or
// not.
//
// To address both of those issues, we store 0 in the redo field if it's
// an online checkpoint record, and the record's *end* LSN if it's a
// shutdown checkpoint. We don't need the original redo pointer in neon,
// because we don't perform WAL replay at startup anyway, so we can get
// away with abusing the redo field like this.
//
// XXX: Ideally, we would persist the extra information in a more
// explicit format, rather than repurpose the fields of the Postgres
// struct like this. However, we already have persisted data like this,
// so we need to maintain backwards compatibility.
//
// NB: We didn't originally have this convention, so there are still old
// persisted records that didn't do this. Before, we didn't update the
// persisted redo field at all. That means that old records have a bogus
// redo pointer that points to some old value, from the checkpoint record
// that was originally imported from the data directory. If it was a
// project created in Neon, that means it points to the first checkpoint
// after initdb. That's OK for our purposes: all such old checkpoints are
// treated as old online checkpoints when the basebackup is created.
cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
xlog_checkpoint.redo
// Store the *end* LSN of the checkpoint record. Or to be precise,
// the start LSN of the *next* record, i.e. if the record ends
// exactly at page boundary, the redo LSN points to just after the
// page header on the next page.
lsn.into()
} else {
0
Lsn::INVALID.into()
};

// Write a new checkpoint key-value pair on every checkpoint record, even
// if nothing really changed. Not strictly required, but it seems nice to
// have some trace of the checkpoint records in the layer files at the same
Expand Down

0 comments on commit 95f4d5b

Please sign in to comment.