From 773bdfae3320bbfc50aa0545077a02854ab87c75 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Feb 2023 14:41:48 -0500 Subject: [PATCH] WIP: takeover installs This adds `bootc install --takeover`, which moves the running container into RAM and invokes `systemctl switch-root` into it, then proceeds with an installation to the previously-used block device. A key use case here is to "take over" a running cloud instance, e.g. provisioning the system via cloud-init (or a similar tool) that invokes `podman run --privileged ... bootc install --takeover`. Currently, this is scoped only to the "builtin" installation types. We could support `install-to-filesystem`-style flows too by allowing externally-configured block storage setups to be run as part of the current container (or, in the fully general case, a distinct container, though that adds a lot of complexity). --- lib/src/cli.rs | 12 ++ lib/src/install.rs | 129 ++++++++++---- lib/src/install/baseline.rs | 7 + lib/src/lib.rs | 2 + lib/src/privtests.rs | 13 ++ lib/src/systemtakeover.rs | 341 ++++++++++++++++++++++++++++++++++++ 6 files changed, 467 insertions(+), 37 deletions(-) create mode 100644 lib/src/systemtakeover.rs diff --git a/lib/src/cli.rs b/lib/src/cli.rs index 97cf96d4e..87bfe3af1 100644 --- a/lib/src/cli.rs +++ b/lib/src/cli.rs @@ -85,6 +85,8 @@ pub(crate) enum TestingOpts { RunPrivilegedIntegration {}, /// Execute integration tests that target a not-privileged ostree container RunContainerIntegration {}, + /// Copy the container as ostree commit to target root + CopySelfTo { target: Utf8PathBuf }, /// Block device setup for testing PrepTestInstallFilesystem { blockdev: Utf8PathBuf }, /// e2e test of install-to-filesystem @@ -368,6 +370,16 @@ where I: IntoIterator, I::Item: Into + Clone, { + let args = args + .into_iter() + .map(|v| Into::::into(v)) + .collect::>(); + if matches!( + args.get(0).and_then(|v| v.to_str()), + Some(crate::systemtakeover::BIN_NAME) + ) { + return crate::systemtakeover::run().await; + }
let opt = Opt::parse_from(args); match opt { Opt::Upgrade(opts) => upgrade(opts).await, diff --git a/lib/src/install.rs b/lib/src/install.rs index 0bac6593f..9a813d9c6 100644 --- a/lib/src/install.rs +++ b/lib/src/install.rs @@ -6,7 +6,7 @@ // This sub-module is the "basic" installer that handles creating basic block device // and filesystem setup. -mod baseline; +pub(crate) mod baseline; use std::io::BufWriter; use std::io::Write; @@ -36,6 +36,9 @@ use crate::lsm::lsm_label; use crate::task::Task; use crate::utils::run_in_host_mountns; +/// The path we use to access files on the host +pub(crate) const HOST_RUNDIR: &str = "/run/host"; + /// The default "stateroot" or "osname"; see https://github.com/ostreedev/ostree/issues/2794 const STATEROOT_DEFAULT: &str = "default"; /// The toplevel boot directory @@ -171,6 +174,8 @@ pub(crate) struct SourceInfo { pub(crate) commit: String, /// Whether or not SELinux appears to be enabled in the source commit pub(crate) selinux: bool, + /// If we should find the image in sysroot/repo, not in containers/storage + pub(crate) from_ostree_repo: bool, } // Shared read-only global state @@ -303,11 +308,13 @@ impl SourceInfo { let root = root.downcast_ref::().unwrap(); let xattrs = root.xattrs(cancellable)?; let selinux = crate::lsm::xattrs_have_selinux(&xattrs); + let from_ostree_repo = false; Ok(Self { imageref, digest, commit, selinux, + from_ostree_repo, }) } } @@ -382,6 +389,14 @@ pub(crate) mod config { } } +pub(crate) fn import_config_from_host() -> ostree_container::store::ImageProxyConfig { + let skopeo_cmd = run_in_host_mountns("skopeo"); + ostree_container::store::ImageProxyConfig { + skopeo_cmd: Some(skopeo_cmd), + ..Default::default() + } +} + #[context("Creating ostree deployment")] async fn initialize_ostree_root_from_self( state: &State, @@ -407,12 +422,12 @@ async fn initialize_ostree_root_from_self( name: imgref.to_string(), }; ostree_container::OstreeImageReference { - sigverify: target_sigverify, + sigverify: 
target_sigverify.clone(), imgref, } } else { ostree_container::OstreeImageReference { - sigverify: target_sigverify, + sigverify: target_sigverify.clone(), imgref: state.source.imageref.clone(), } }; @@ -442,49 +457,72 @@ async fn initialize_ostree_root_from_self( let sysroot = ostree::Sysroot::new(Some(&gio::File::for_path(rootfs))); sysroot.load(cancellable)?; + let dest_repo = &sysroot.repo().unwrap(); // We need to fetch the container image from the root mount namespace - let skopeo_cmd = run_in_host_mountns("skopeo"); - let proxy_cfg = ostree_container::store::ImageProxyConfig { - skopeo_cmd: Some(skopeo_cmd), - ..Default::default() - }; - - let mut temporary_dir = None; - let src_imageref = if skopeo_supports_containers_storage()? { - // We always use exactly the digest of the running image to ensure predictability. - let spec = - crate::utils::digested_pullspec(&state.source.imageref.name, &state.source.digest); - ostree_container::ImageReference { - transport: ostree_container::Transport::ContainerStorage, - name: spec, - } - } else { - let td = tempfile::tempdir_in("/var/tmp")?; - let path: &Utf8Path = td.path().try_into().unwrap(); - let r = copy_to_oci(&state.source.imageref, path)?; - temporary_dir = Some(td); - r - }; - let src_imageref = ostree_container::OstreeImageReference { - // There are no signatures to verify since we're fetching the already - // pulled container. - sigverify: ostree_container::SignatureSource::ContainerPolicyAllowInsecure, - imgref: src_imageref, - }; + let proxy_cfg = import_config_from_host(); let kargs = root_setup .kargs .iter() .map(|v| v.as_str()) .collect::>(); + + // Default image reference pulls from the running container image. + let mut src_imageref = ostree_container::OstreeImageReference { + // There are no signatures to verify since we're fetching the already + // pulled container. 
+ sigverify: SignatureSource::ContainerPolicyAllowInsecure, + imgref: state.source.imageref.clone(), + }; #[allow(clippy::needless_update)] - let options = ostree_container::deploy::DeployOpts { + let mut options = ostree_container::deploy::DeployOpts { kargs: Some(kargs.as_slice()), - target_imgref: Some(&target_imgref), proxy_cfg: Some(proxy_cfg), ..Default::default() }; + + let mut temporary_dir = None; + if state.source.from_ostree_repo { + let root = Dir::open_ambient_dir("/", cap_std::ambient_authority())?; + let host_repo = { + let repodir = root + .open_dir("sysroot/repo") + .context("Opening sysroot/repo")?; + ostree::Repo::open_at_dir(&repodir, ".")? + }; + ostree_container::store::copy_as( + &host_repo, + &state.source.imageref, + &dest_repo, + &target_imgref.imgref, + ) + .await + .context("Copying image from host repo")?; + // We already copied the image, so src == target + src_imageref = target_imgref.clone(); + options.target_imgref = None; + } else { + if skopeo_supports_containers_storage()? { + // We always use exactly the digest of the running image to ensure predictability. + let spec = + crate::utils::digested_pullspec(&state.source.imageref.name, &state.source.digest); + ostree_container::ImageReference { + transport: ostree_container::Transport::ContainerStorage, + name: spec, + } + } else { + let td = tempfile::tempdir_in("/var/tmp")?; + let path: &Utf8Path = td.path().try_into().unwrap(); + let r = copy_to_oci(&state.source.imageref, path)?; + temporary_dir = Some(td); + r + }; + // In this case the deploy code is pulling the container, so set it up to + // generate a target image reference. + options.target_imgref = Some(&target_imgref); + } + println!("Creating initial deployment"); let state = ostree_container::deploy::deploy(&sysroot, stateroot, &src_imageref, Some(options)).await?; @@ -811,11 +849,16 @@ fn installation_complete() { println!("Installation complete!"); } -/// Implementation of the `bootc install` CLI command. 
-pub(crate) async fn install(opts: InstallOpts) -> Result<()> { - let block_opts = opts.block_opts; - let state = prepare_install(opts.config_opts, opts.target_opts).await?; +pub(crate) async fn install_takeover( + opts: InstallBlockDeviceOpts, + state: Arc, +) -> Result<()> { + // The takeover code should have unset this + assert!(!opts.takeover); + block_install_impl(opts, state).await +} +async fn block_install_impl(block_opts: InstallBlockDeviceOpts, state: Arc) -> Result<()> { // This is all blocking stuff let mut rootfs = { let state = state.clone(); @@ -841,6 +884,18 @@ pub(crate) async fn install(opts: InstallOpts) -> Result<()> { Ok(()) } +/// Implementation of the `bootc install` CLI command. +pub(crate) async fn install(opts: InstallOpts) -> Result<()> { + let block_opts = opts.block_opts; + let state = prepare_install(opts.config_opts, opts.target_opts).await?; + if block_opts.takeover { + tracing::debug!("Performing takeover installation from host"); + return crate::systemtakeover::run_from_host(block_opts, state).await; + } + + block_install_impl(block_opts, state).await +} + #[context("Verifying empty rootfs")] fn require_empty_rootdir(rootfs_fd: &Dir) -> Result<()> { for e in rootfs_fd.entries()? { diff --git a/lib/src/install/baseline.rs b/lib/src/install/baseline.rs index cc547f9fa..e36e6c3f4 100644 --- a/lib/src/install/baseline.rs +++ b/lib/src/install/baseline.rs @@ -84,6 +84,13 @@ pub(crate) struct InstallBlockDeviceOpts { #[serde(default)] pub(crate) wipe: bool, + /// Write to the block device containing the running root filesystem. + /// This is implemented by moving the container into memory and switching + /// root (terminating all other processes). + #[clap(long)] + #[serde(default)] + pub(crate) takeover: bool, + /// Target root block device setup. 
/// /// direct: Filesystem written directly to block device diff --git a/lib/src/lib.rs b/lib/src/lib.rs index d1f031520..69daa3b49 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -37,6 +37,8 @@ pub(crate) mod mount; #[cfg(feature = "install")] mod podman; #[cfg(feature = "install")] +pub(crate) mod systemtakeover; +#[cfg(feature = "install")] mod task; #[cfg(feature = "docgen")] diff --git a/lib/src/privtests.rs b/lib/src/privtests.rs index e476aee17..d998421c4 100644 --- a/lib/src/privtests.rs +++ b/lib/src/privtests.rs @@ -2,6 +2,8 @@ use std::process::Command; use anyhow::Result; use camino::{Utf8Path, Utf8PathBuf}; +use cap_std::fs::Dir; +use cap_std_ext::cap_std; use cap_std_ext::rustix; use fn_error_context::context; use rustix::fd::AsFd; @@ -169,6 +171,17 @@ pub(crate) async fn run(opts: TestingOpts) -> Result<()> { TestingOpts::RunContainerIntegration {} => { tokio::task::spawn_blocking(impl_run_container).await? } + TestingOpts::CopySelfTo { target } => { + let target = Dir::open_ambient_dir(target, cap_std::ambient_authority())?; + let container_info = crate::containerenv::get_container_execution_info()?; + let srcdata = crate::install::SourceInfo::from_container(&container_info)?; + let did_override = + crate::install::reexecute_self_for_selinux_if_needed(&srcdata, false)?; + // Right now we don't expose an override flow + assert!(!did_override); + crate::systemtakeover::copy_self_to(&srcdata, &target).await?; + Ok(()) + } TestingOpts::PrepTestInstallFilesystem { blockdev } => { tokio::task::spawn_blocking(move || prep_test_install_filesystem(&blockdev).map(|_| ())) .await? diff --git a/lib/src/systemtakeover.rs b/lib/src/systemtakeover.rs new file mode 100644 index 000000000..e2a85e561 --- /dev/null +++ b/lib/src/systemtakeover.rs @@ -0,0 +1,341 @@ +//! This module contains logic to "take over" a system by moving the running +//! container into RAM, then switching root to it (becoming PID 1). +//! 
+ +use std::fs::{File, OpenOptions}; +use std::io::{BufReader, Write}; +use std::process::Command; +use std::sync::Arc; +use std::{io::BufWriter, os::unix::process::CommandExt}; + +use anyhow::{Context, Result}; +use camino::Utf8Path; +use cap_std::fs::{Dir, DirBuilder}; +use cap_std_ext::cap_std; +use cap_std_ext::prelude::CapStdExtDirExt; +use cap_std_ext::rustix::fd::AsRawFd; +use cap_std_ext::rustix::process::getpid; +use fn_error_context::context; +use ostree_ext::container as ostree_container; +use ostree_ext::ostree; +use ostree_ext::ostree::gio; +use serde::{Deserialize, Serialize}; + +use crate::install::baseline::InstallBlockDeviceOpts; +use crate::install::{SourceInfo, State}; +use crate::task::Task; +use crate::utils::run_in_host_mountns; + +const BOOTC_RUNDIR: &str = "bootc"; +/// The system global path we move ourself into +const MEMORY_ROOT: &str = "tmp-root"; +/// The subdirectory of the memory root with the ostree repo +const SYSROOT: &str = "sysroot"; +/// The subdirectory of the memory root which has our new root +const ROOTDIR: &str = "rootfs"; +/// The file path in the root where we serialize the install options struct +const STATE_PATH: &str = "install-state.json"; + +/// The filesystem name we use as a hard link to know we're running as init +pub(crate) const BIN_NAME: &str = "bootc-install-from-memory"; + +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "kebab-case")] +pub(crate) struct SerializedState { + source_imageref: String, + source_digest: String, + config: crate::install::config::InstallConfiguration, + commit: String, + selinux: bool, +} + +impl SerializedState { + /// The file path in the root where we write SerializedState + const PATH: &str = "takeover-source.json"; +} + +/// Global state for takeover process +pub(crate) struct RunContext { + console: BufWriter, +} + +// This is somewhat similar to what we do for installing to a target root; +// pull the data from containers-storage and synthesize an ostree 
commit, +// but instead of doing a full deployment, just do a pull and then check out. +// A full deployment is like a checkout but also updates the bootloader config +// and has concepts like a "stateroot" etc. that we don't need here. +#[context("Copying self")] +pub(crate) async fn copy_self_to(source: &SourceInfo, target: &Dir) -> Result<()> { + use ostree_container::store::PrepareResult; + let cancellable = gio::Cancellable::NONE; + let self_imgref = ostree_container::OstreeImageReference { + // There are no signatures to verify since we're fetching the already + // pulled container. + sigverify: ostree_container::SignatureSource::ContainerPolicyAllowInsecure, + imgref: source.imageref.clone(), + }; + + tracing::debug!("Preparing import from {self_imgref}"); + // We need to fetch the container image from the root mount namespace + let proxy_cfg = crate::install::import_config_from_host(); + let repo_path = &Utf8Path::new(SYSROOT).join("repo"); + let mut db = DirBuilder::new(); + db.recursive(true); + target + .ensure_dir_with(repo_path, &db) + .with_context(|| format!("Creating {repo_path}"))?; + let repo = &ostree::Repo::create_at_dir(target, "repo", ostree::RepoMode::Bare, None) + .context("Creating repo")?; + repo.set_disable_fsync(true); + let mut imp = ostree_container::store::ImageImporter::new(&repo, &self_imgref, proxy_cfg) + .await + .context("Initializing importer")?; + let img = match imp.prepare().await? { + PrepareResult::AlreadyPresent(i) => i, + PrepareResult::Ready(r) => imp.import(r).await?, + }; + + let commit = img.get_commit(); + tracing::debug!("Imported {commit}"); + + target + .remove_all_optional(ROOTDIR) + .context("Cleaning up {ROOTDIR}")?; + + // We've imported the container as an ostree commit. + // Now check out the filesystem tree. 
+ tokio::task::spawn_blocking({ + let repo = repo.clone(); + let target = target.try_clone()?; + let commit = commit.to_owned(); + let cancellable = cancellable.clone(); + move || { + let checkout_opts = ostree::RepoCheckoutAtOptions { + mode: ostree::RepoCheckoutMode::None, + no_copy_fallback: true, + force_copy_zerosized: true, + enable_fsync: false, + ..Default::default() + }; + tracing::debug!("Performing checkout"); + repo.checkout_at( + Some(&checkout_opts), + target.as_raw_fd(), + ROOTDIR, + &commit, + cancellable, + ) + } + }) + .await + .context("Performing checkout")??; + + // This special hardlink signals the main code that we're in takeover mode. + let rootdir = target.open_dir(ROOTDIR)?; + rootdir + .hard_link( + "usr/bin/bootc", + &rootdir, + Utf8Path::new("usr/bin").join(BIN_NAME), + ) + .with_context(|| format!("Hard linking to {BIN_NAME}"))?; + tracing::debug!("Checkout OK"); + + Ok(()) +} + +/// Prepare mounts in the target root before we switch +#[context("Setting up target root")] +fn setup_target_root(root: &Dir) -> Result<()> { + // Mount /sysroot in the target root so we can see the ostree repo + let target_sysroot = Utf8Path::new(ROOTDIR).join("sysroot"); + Task::new(format!("Bind mount /sysroot"), "mount") + .cwd(root)? 
+ .args(["--bind", "sysroot", target_sysroot.as_str()]) + .run()?; + Ok(()) +} + +#[context("Re-executing to perform install from memory")] +pub(crate) async fn run_from_host(opts: InstallBlockDeviceOpts, state: Arc) -> Result<()> { + let host_runpath = super::install::HOST_RUNDIR; + let host_run = Dir::open_ambient_dir(host_runpath, cap_std::ambient_authority()) + .with_context(|| format!("Failed to open {host_runpath}"))?; + let global_rundir = host_run.open_dir("run").context("Opening host /run")?; + global_rundir + .create_dir_all(BOOTC_RUNDIR) + .with_context(|| format!("Creating {BOOTC_RUNDIR}"))?; + let rundir = global_rundir.open_dir(BOOTC_RUNDIR)?; + // Copy the container to /run/bootc/tmp-root + rundir + .create_dir_all(MEMORY_ROOT) + .with_context(|| format!("Creating {MEMORY_ROOT}"))?; + let target = rundir + .open_dir(MEMORY_ROOT) + .with_context(|| format!("Opening {MEMORY_ROOT}"))?; + tracing::debug!("Writing to {MEMORY_ROOT}"); + + copy_self_to(&state.source, &target).await?; + // Prepare mounts in the new temporary root + setup_target_root(&target)?; + tracing::debug!("Set up target root"); + + // Serialize the install data into /run so we can pick it up when we re-execute + rundir + .atomic_replace_with(STATE_PATH, move |w| { + serde_json::to_writer(w, &opts)?; + anyhow::Ok(()) + }) + .context("Writing serialized options")?; + // Serialize the container source into /run too + rundir + .atomic_replace_with(SerializedState::PATH, move |w| { + let state = SerializedState { + source_imageref: state.source.imageref.to_string(), + source_digest: state.source.digest.clone(), + config: state.install_config.clone(), + commit: state.source.commit.clone(), + selinux: state.source.selinux, + }; + serde_json::to_writer(w, &state)?; + anyhow::Ok(()) + }) + .context("Writing serialized state")?; + + if dbg!(std::env::var_os("bootc_exit").is_some()) { + return Ok(()); + } + + // Systemd tries to reload policy in the new root, which fails because policy is already 
+ // loaded; we don't want that here. So for takeover installs, let's just set permissive mode. + crate::lsm::selinux_set_permissive()?; + tracing::debug!("Invoking systemctl switch-root"); + let abs_target_root = format!("/run/{BOOTC_RUNDIR}/{MEMORY_ROOT}/{ROOTDIR}"); + let bin = format!("/usr/bin/{BIN_NAME}"); + // Then, systemctl switch-root to our target root, re-executing our own + // binary. + // We will likely want to accept things like a log file path (and support remote logging) + // so that we can easily debug the install process if it fails in the middle. + Task::new_cmd( + "Invoking systemctl switch-root", + run_in_host_mountns("systemctl"), + ) + .args(["switch-root", abs_target_root.as_str(), bin.as_str()]) + .run()?; + + println!("Waiting for termination of this process..."); + // 5 minutes should be long enough for practical purposes + std::thread::sleep(std::time::Duration::from_secs(5 * 60)); + + anyhow::bail!("Failed to wait for systemctl switch-root"); +} + +async fn run_impl(ctx: &mut RunContext) -> Result<()> { + anyhow::ensure!(getpid().is_init()); + + let _ = writeln!(ctx.console, "bootc: Preparing takeover installation"); + + let global_rundir = Dir::open_ambient_dir("/run", cap_std::ambient_authority())?; + + // Deserialize our system state + let opts = { + let f = global_rundir + .open(STATE_PATH) + .map(BufReader::new) + .with_context(|| format!("Opening {STATE_PATH}"))?; + let mut opts: crate::install::InstallOpts = serde_json::from_reader(f)?; + // This must be set if we got here + anyhow::ensure!(opts.block_opts.takeover); + // But avoid infinite recursion =) + opts.block_opts.takeover = false; + opts + }; + let serialized_state: SerializedState = { + let f = global_rundir + .open(SerializedState::PATH) + .map(BufReader::new) + .with_context(|| format!("Opening {}", SerializedState::PATH))?; + serde_json::from_reader(f).context("Parsing serialized state")? 
+ }; + + let state = State { + source: SourceInfo { + imageref: serialized_state.source_imageref.as_str().try_into()?, + digest: serialized_state.source_digest, + from_ostree_repo: true, + commit: serialized_state.commit, + selinux: serialized_state.selinux, + }, + override_disable_selinux: opts.config_opts.disable_selinux, + config_opts: opts.config_opts, + target_opts: opts.target_opts, + install_config: serialized_state.config, + }; + let state = Arc::new(state); + + // Now, we perform the install to the target block device. We should be + // pretty confident that all prior mounts were torn down (systemd should + // have done this, but see above for using `systemctl isolate` to help make + // sure. + + // TODO: In this model we already have the ostree commit pulled. Let's + // refactor the install code to also have a "from pre-pulled ostree commit" + // path too. + crate::install::install_takeover(opts.block_opts, state).await?; + Ok(()) +} + +/// There's not much we can do if something went wrong as pid 1. +/// Write the error to the console, then reboot. +fn handle_fatal_error(ctx: &mut RunContext, e: anyhow::Error) -> ! { + // Best effort to write output to the console + let _ = writeln!(ctx.console, "{e}"); + let _ = ctx.console.flush(); + + // There is definitely going to be something better to do here, + // but it entirely depends on at which point we fail. Roughly + // likely something like this: + // + // - Before we run sgdisk to rewrite the target blockdev: + // We can write to a log file that persists, the host should still + // be fine on the next boot. + // - After we've run sgdisk, but before the deployment completes: + // Perhaps try to rewrite the partition tables again and at least + // record the error to a temporary partition. (Or we could + // *always* log to a temporary partition, but then remove it just + // before rebooting?) + std::thread::sleep(std::time::Duration::from_secs(60)); + reboot() +} + +fn reboot() -> ! 
{ + let e = Command::new("reboot").arg("-ff").exec(); + panic!("Failed to exec reboot: {e}"); +} + +// Because we're running as pid1, exiting will cause a kernel panic; so we don't. +pub(crate) async fn run() -> ! { + // At this point, we're running as pid1. We could fork ourself and run + // the real work as a child process if that helped things, but eh. + + let console = match OpenOptions::new() + .write(true) + .open("/dev/console") + .map(BufWriter::new) + { + Ok(c) => c, + Err(e) => { + panic!("Failed to open /dev/console: {e}") + } + }; + let mut ctx = RunContext { console }; + + match run_impl(&mut ctx).await { + Ok(()) => { + let _ = writeln!(ctx.console, "Rebooting"); + println!("Rebooting"); + reboot() + } + Err(e) => handle_fatal_error(&mut ctx, e), + } +}