diff --git a/Makefile-boot.am b/Makefile-boot.am
index 90f9804834..c07b6b8123 100644
--- a/Makefile-boot.am
+++ b/Makefile-boot.am
@@ -42,6 +42,7 @@ systemdsystemunit_DATA = src/boot/ostree-prepare-root.service \
src/boot/ostree-finalize-staged.service \
src/boot/ostree-finalize-staged.path \
src/boot/ostree-finalize-staged-hold.service \
+ src/boot/ostree-state-overlay@.service \
$(NULL)
systemdtmpfilesdir = $(prefix)/lib/tmpfiles.d
dist_systemdtmpfiles_DATA = src/boot/ostree-tmpfiles.conf
@@ -72,6 +73,7 @@ EXTRA_DIST += src/boot/dracut/module-setup.sh \
src/boot/ostree-remount.service \
src/boot/ostree-finalize-staged.service \
src/boot/ostree-finalize-staged-hold.service \
+ src/boot/ostree-state-overlay@.service \
src/boot/grub2/grub2-15_ostree \
src/boot/grub2/ostree-grub-generator \
$(NULL)
diff --git a/Makefile-ostree.am b/Makefile-ostree.am
index ade079c976..d2447ffe9d 100644
--- a/Makefile-ostree.am
+++ b/Makefile-ostree.am
@@ -85,6 +85,7 @@ ostree_SOURCES += \
src/ostree/ot-admin-builtin-post-copy.c \
src/ostree/ot-admin-builtin-upgrade.c \
src/ostree/ot-admin-builtin-unlock.c \
+ src/ostree/ot-admin-builtin-state-overlay.c \
src/ostree/ot-admin-builtins.h \
src/ostree/ot-admin-instutil-builtin-selinux-ensure-labeled.c \
src/ostree/ot-admin-instutil-builtin-set-kargs.c \
diff --git a/src/boot/ostree-state-overlay@.service b/src/boot/ostree-state-overlay@.service
new file mode 100644
index 0000000000..dc8aeac51b
--- /dev/null
+++ b/src/boot/ostree-state-overlay@.service
@@ -0,0 +1,36 @@
+# Copyright (C) 2023 Red Hat Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <https://www.gnu.org/licenses/>.
+
+[Unit]
+Description=OSTree State Overlay On /%I
+Documentation=man:ostree(1)
+DefaultDependencies=no
+ConditionKernelCommandLine=ostree
+# run after /var is set up, since that's where the upperdir is stored,
+# and after boot.mount so we can load the sysroot
+After=var.mount boot.mount
+# but before local-fs.target, which we consider ourselves a part of
+Before=local-fs.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/bin/ostree admin state-overlay %i /%I
+StandardInput=null
+StandardOutput=journal
+StandardError=journal+console
+
+[Install]
+WantedBy=local-fs.target
diff --git a/src/ostree/ot-admin-builtin-state-overlay.c b/src/ostree/ot-admin-builtin-state-overlay.c
new file mode 100644
index 0000000000..64380bb0f2
--- /dev/null
+++ b/src/ostree/ot-admin-builtin-state-overlay.c
@@ -0,0 +1,243 @@
+/* Copyright (C) 2023 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: LGPL-2.0+
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/xattr.h>
+
+#include "glnx-errors.h"
+#include "glnx-fdio.h"
+#include "glnx-local-alloc.h"
+#include "glnx-shutil.h"
+#include "glnx-xattrs.h"
+#include "ot-admin-builtins.h"
+
+#define OSTREE_STATEOVERLAYS_DIR "/var/ostree/state-overlays" /* parent of the per-NAME dirs */
+#define OSTREE_STATEOVERLAY_UPPER_DIR "upper" /* overlayfs upperdir, below the per-NAME dir */
+#define OSTREE_STATEOVERLAY_WORK_DIR "work" /* overlayfs workdir, below the per-NAME dir */
+
+/* https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html */
+#define OVERLAYFS_DIR_XATTR_OPAQUE "trusted.overlay.opaque" /* value "y" marks an opaque dir */
+
+static GOptionEntry options[] = { { NULL } }; /* no command-specific options */
+
+/* Create OVERLAY_DIR plus its work/ and upper/ children; return an fd on OVERLAY_DIR. */
+static gboolean
+ensure_overlay_dirs (const char *overlay_dir, int *out_overlay_dfd, GCancellable *cancellable,
+                     GError **error)
+{
+  const char *const subdirs[] = { OSTREE_STATEOVERLAY_WORK_DIR, OSTREE_STATEOVERLAY_UPPER_DIR };
+  glnx_autofd int dfd = -1;
+  if (!glnx_shutil_mkdir_p_at_open (AT_FDCWD, overlay_dir, 0755, &dfd, cancellable, error))
+    return FALSE;
+
+  for (gsize i = 0; i < G_N_ELEMENTS (subdirs); i++)
+    if (!glnx_shutil_mkdir_p_at (dfd, subdirs[i], 0755, cancellable, error))
+      return FALSE;
+
+  *out_overlay_dfd = glnx_steal_fd (&dfd);
+  return TRUE;
+}
+
+/* Report in OUT_IS_OPAQUE whether DNAME (looked up relative to DFD, symlinks not followed)
+ * has a "trusted.overlay.opaque" xattr whose value is exactly "y". A missing xattr
+ * (ENODATA) yields FALSE in OUT_IS_OPAQUE; any other lgetxattr() failure sets ERROR. */
+static gboolean
+is_opaque_dir (int dfd, const char *dname, gboolean *out_is_opaque, GError **error)
+{
+  /* XXX: this is basically like a `glnx_lgetxattrat_allow_noent()`; upstream it */
+
+  /* heap-allocated: a fixed buffer could silently truncate into a different path */
+  g_autofree char *path = g_strdup_printf ("/proc/self/fd/%d/%s", dfd, dname);
+
+  ssize_t len, real_len;
+  if (TEMP_FAILURE_RETRY (len = lgetxattr (path, OVERLAYFS_DIR_XATTR_OPAQUE, NULL, 0)) < 0)
+    {
+      if (errno != ENODATA)
+        return glnx_throw_errno_prefix (error, "lgetxattr(%s)", OVERLAYFS_DIR_XATTR_OPAQUE);
+      *out_is_opaque = FALSE;
+      return TRUE;
+    }
+
+  g_autofree guint8 *buf = g_malloc (len);
+  if (TEMP_FAILURE_RETRY (real_len = lgetxattr (path, OVERLAYFS_DIR_XATTR_OPAQUE, buf, len)) < 0)
+    return glnx_throw_errno_prefix (error, "lgetxattr(%s)", OVERLAYFS_DIR_XATTR_OPAQUE);
+
+  *out_is_opaque = (real_len == 1 && buf[0] == 'y');
+  return TRUE;
+}
+
+/* Walk the upperdir tree at UPPER_DFD and delete every entry that shadows an entry
+ * of the same name under LOWER_DFD. Entries existing only in the upperdir are kept.
+ * A directory shadowing a directory is descended into rather than deleted, unless
+ * it is marked opaque, in which case it is removed wholesale. */
+static gboolean
+prune_upperdir_recurse (int lower_dfd, int upper_dfd, GCancellable *cancellable, GError **error)
+{
+  g_auto (GLnxDirFdIterator) upper_iter = { 0 };
+  if (!glnx_dirfd_iterator_init_at (upper_dfd, ".", FALSE, &upper_iter, error))
+    return FALSE;
+
+  for (;;)
+    {
+      struct dirent *entry = NULL;
+      if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&upper_iter, &entry, cancellable, error))
+        return FALSE;
+      if (entry == NULL)
+        return TRUE; /* directory exhausted */
+
+      const char *name = entry->d_name;
+      const gboolean upper_is_dir = (entry->d_type == DT_DIR);
+
+      /* look for a same-named entry in the lowerdir; errno says whether it exists */
+      struct stat lower_st;
+      if (!glnx_fstatat_allow_noent (lower_dfd, name, &lower_st, AT_SYMLINK_NOFOLLOW, error))
+        return FALSE;
+      if (errno == ENOENT)
+        continue; /* upperdir-only entry, i.e. state: leave it alone */
+
+      /* dir-over-dir: merge by recursing, except when the upper one is opaque */
+      if (upper_is_dir && S_ISDIR (lower_st.st_mode))
+        {
+          gboolean opaque = FALSE;
+          if (!is_opaque_dir (upper_dfd, name, &opaque, error))
+            return FALSE;
+
+          if (!opaque)
+            {
+              glnx_autofd int lower_child = -1;
+              if (!glnx_opendirat (lower_dfd, name, FALSE, &lower_child, error))
+                return FALSE;
+              glnx_autofd int upper_child = -1;
+              if (!glnx_opendirat (upper_dfd, name, FALSE, &upper_child, error))
+                return FALSE;
+              if (!prune_upperdir_recurse (lower_child, upper_child, cancellable, error))
+                return glnx_prefix_error (error, "in %s", name);
+
+              continue;
+            }
+        }
+
+      /* anything else that shadows is dropped (this covers whiteouts and opaque dirs too) */
+      if (upper_is_dir)
+        {
+          if (!glnx_shutil_rm_rf_at (upper_dfd, name, cancellable, error))
+            return FALSE;
+        }
+      else if (!glnx_unlinkat (upper_dfd, name, 0, error))
+        return FALSE;
+    }
+}
+
+/* Prune the "upper" directory below OVERLAY_DFD against the tree at MOUNTPATH, then touch
+ * "upper" so that its timestamps record the prune as completed.
+ * NOTE(review): SYSROOT_FD and DEPLOYMENT_PATH are accepted but not used here. */
+static gboolean
+prune_upperdir (int sysroot_fd, const char *deployment_path, const char *mountpath, int overlay_dfd,
+                GCancellable *cancellable, GError **error)
+{
+  glnx_autofd int lower_dfd = -1;
+  if (!glnx_opendirat (AT_FDCWD, mountpath, FALSE, &lower_dfd, error))
+    return FALSE;
+  glnx_autofd int upper_dfd = -1;
+  if (!glnx_opendirat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, FALSE, &upper_dfd, error))
+    return FALSE;
+
+  if (!prune_upperdir_recurse (lower_dfd, upper_dfd, cancellable, error))
+    return FALSE;
+
+  if (utimensat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, NULL, 0) < 0)
+    return glnx_throw_errno_prefix (error, "utimensat(upper)");
+  return TRUE;
+}
+
+/* Mount an overlayfs on MOUNTPATH with MOUNTPATH itself as lowerdir and the "upper" and
+ * "work" directories of overlay NAME. Plain paths (rather than /proc/self/fd/N) are used
+ * on purpose: the option string ends up in the mount table and should stay readable. */
+static gboolean
+mount_overlay (const char *mountpath, const char *name, GError **error)
+{
+  g_autofree char *upper_path
+      = g_build_filename (OSTREE_STATEOVERLAYS_DIR, name, OSTREE_STATEOVERLAY_UPPER_DIR, NULL);
+  g_autofree char *work_path
+      = g_build_filename (OSTREE_STATEOVERLAYS_DIR, name, OSTREE_STATEOVERLAY_WORK_DIR, NULL);
+  g_autofree char *mount_data
+      = g_strdup_printf ("lowerdir=%s,upperdir=%s,workdir=%s", mountpath, upper_path, work_path);
+
+  if (mount ("overlay", mountpath, "overlay", MS_SILENT, mount_data) >= 0)
+    return TRUE;
+  return glnx_throw_errno_prefix (error, "mount(%s)", mountpath);
+}
+
+/* `ostree admin state-overlay NAME MOUNTPATH`; called by ostree-state-overlay@.service.
+ * Ensures /var/ostree/state-overlays/NAME/{upper,work} exist, prunes the upperdir when it
+ * is older than the booted deployment directory, and finally mounts the overlay. */
+gboolean
+ot_admin_builtin_state_overlay (int argc, char **argv, OstreeCommandInvocation *invocation,
+                                GCancellable *cancellable, GError **error)
+{
+  g_autoptr (GOptionContext) opt_context = g_option_context_new ("NAME MOUNTPATH");
+  g_autoptr (OstreeSysroot) sysroot = NULL;
+
+  /* parse the command line; this also fills in `sysroot` */
+  if (!ostree_admin_option_context_parse (opt_context, options, &argc, &argv,
+                                          OSTREE_ADMIN_BUILTIN_FLAG_NONE, invocation, &sysroot,
+                                          cancellable, error))
+    return FALSE;
+
+  if (argc < 3)
+    return glnx_throw (error, "Missing NAME or MOUNTPATH");
+
+  const char *overlay_name = argv[1];
+  const char *mountpath = argv[2];
+
+  /* Sanity-check */
+  OstreeDeployment *booted = ostree_sysroot_get_booted_deployment (sysroot);
+  if (booted == NULL)
+    return glnx_throw (error, "Must be booted into an OSTree deployment");
+
+  g_autofree char *overlay_dir = g_build_filename (OSTREE_STATEOVERLAYS_DIR, overlay_name, NULL);
+  glnx_autofd int overlay_dfd = -1;
+  if (!ensure_overlay_dirs (overlay_dir, &overlay_dfd, cancellable, error))
+    return FALSE;
+
+  struct stat upper_st;
+  if (!glnx_fstatat (overlay_dfd, OSTREE_STATEOVERLAY_UPPER_DIR, &upper_st, 0, error))
+    return FALSE;
+
+  /* The deployment directory is stat'ed rather than "/" because the latter may have
+   * e.g. an overlay slapped on from root.transient or composefs. */
+  g_autofree char *deployment_path
+      = ostree_sysroot_get_deployment_dirpath (sysroot, booted);
+  const int sysroot_fd = ostree_sysroot_get_fd (sysroot);
+  struct stat deployment_st;
+  if (!glnx_fstatat (sysroot_fd, deployment_path, &deployment_st, 0, error))
+    return FALSE;
+
+  /* upperdir older than the deployment => the lowerdir was updated; prune first */
+  const gboolean needs_prune = upper_st.st_mtime < deployment_st.st_mtime;
+  if (needs_prune
+      && !prune_upperdir (sysroot_fd, deployment_path, mountpath, overlay_dfd, cancellable, error))
+    return glnx_prefix_error (error, "Pruning upperdir for %s", overlay_name);
+
+  return mount_overlay (mountpath, overlay_name, error);
+}
diff --git a/src/ostree/ot-admin-builtins.h b/src/ostree/ot-admin-builtins.h
index 1775384e17..cd1472bf3d 100644
--- a/src/ostree/ot-admin-builtins.h
+++ b/src/ostree/ot-admin-builtins.h
@@ -50,6 +50,7 @@ BUILTINPROTO (upgrade);
BUILTINPROTO (kargs);
BUILTINPROTO (post_copy);
BUILTINPROTO (lock_finalization);
+BUILTINPROTO (state_overlay);
#undef BUILTINPROTO
diff --git a/src/ostree/ot-builtin-admin.c b/src/ostree/ot-builtin-admin.c
index 35a1e115c1..68a54751f0 100644
--- a/src/ostree/ot-builtin-admin.c
+++ b/src/ostree/ot-builtin-admin.c
@@ -42,6 +42,8 @@ static OstreeCommand admin_subcommands[] = {
"Change the finalization locking state of the staged deployment" },
{ "boot-complete", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
ot_admin_builtin_boot_complete, "Internal command to run at boot after an update was applied" },
+ { "state-overlay", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN,
+ ot_admin_builtin_state_overlay, "Internal command to assemble a state overlay" },
{ "init-fs", OSTREE_BUILTIN_FLAG_NO_REPO, ot_admin_builtin_init_fs,
"Initialize a root filesystem" },
{ "instutil", OSTREE_BUILTIN_FLAG_NO_REPO | OSTREE_BUILTIN_FLAG_HIDDEN, ot_admin_builtin_instutil,
diff --git a/tests/kolainst/destructive/state-overlay.sh b/tests/kolainst/destructive/state-overlay.sh
new file mode 100755
index 0000000000..4442611e9f
--- /dev/null
+++ b/tests/kolainst/destructive/state-overlay.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+set -xeuo pipefail
+# Test ostree-state-overlay@.service on /foobar; each reboot mark below is one phase.
+. "${KOLA_EXT_DATA}/libinsttest.sh"
+# NB: negative checks are written `test ! ...`; errexit ignores commands negated with `!`.
+case "${AUTOPKGTEST_REBOOT_MARK:-}" in
+  "")
+    # create a new ostree commit with some toplevel content
+    mkdir -p /var/tmp/rootfs/foobar
+    (cd /var/tmp/rootfs/foobar
+     touch an_empty_file
+     echo 'foobar' > a_non_empty_file
+     echo 'foobar' > another_file
+     ln -s an_empty_file a_working_symlink
+     ln -s enoent a_broken_symlink
+     mkdir an_empty_subdir
+     mkdir a_nonempty_subdir
+     echo foobar > a_nonempty_subdir/foobar
+     mkdir -p a_deeply/deeply/nested/subdir
+     echo foobar > a_deeply/deeply/nested/subdir/foobar
+
+     # test content deletion
+     mkdir a_dir_to_delete
+     touch a_file_to_delete
+     ln -s enoent a_symlink_to_delete
+
+     # opaque directory
+     mkdir a_dir_to_make_opaque
+     touch a_dir_to_make_opaque/base
+    )
+
+    ostree commit --no-bindings -P -b foobar --tree=ref="${host_commit}" --tree=dir=/var/tmp/rootfs
+    rpm-ostree rebase :foobar
+    systemctl enable ostree-state-overlay@foobar.service
+    /tmp/autopkgtest-reboot "2"
+    ;;
+  "2")
+    if ! test -d /foobar; then
+      fatal "no /foobar toplevel dir"
+    fi
+    if [[ $(findmnt /foobar -no SOURCE) != overlay ]]; then
+      fatal "/foobar is not overlay"
+    fi
+
+    cd /foobar
+
+    # create some state files (i.e. not shadowing)
+    echo "state" > state
+    echo "state" > a_nonempty_subdir/state
+    echo "state" > a_deeply/deeply/nested/subdir/state
+    ln -s foobar state_symlink
+    mkdir state_dir
+
+    # and shadow some base files
+
+    # make empty file non-empty
+    echo shadow > an_empty_file
+    # make a file become a directory
+    rm a_non_empty_file && mkdir a_non_empty_file
+    # make a file become a symlink
+    ln -sf some_target another_file
+    # override a working symlink
+    ln -sf another_file a_working_symlink
+    # override a non-working symlink
+    ln -sf enoent2 a_broken_symlink
+    # make dir become a file
+    rmdir an_empty_subdir
+    touch an_empty_subdir
+    # override file in a shallow subdir
+    echo shadow > a_nonempty_subdir/foobar
+    # override file in a deep subdir
+    echo shadow > a_deeply/deeply/nested/subdir/foobar
+    # delete some base files
+    rmdir a_dir_to_delete
+    rm a_file_to_delete
+    rm a_symlink_to_delete
+    # opaque directory
+    rm -rf a_dir_to_make_opaque
+    mkdir a_dir_to_make_opaque
+    touch a_dir_to_make_opaque/state
+
+    # check that rebooting without upgrading maintains state
+    /tmp/autopkgtest-reboot "3"
+    ;;
+  "3")
+    cd /foobar
+
+    # check state is still there
+    assert_file_has_content state state
+    assert_file_has_content a_nonempty_subdir/state state
+    assert_file_has_content a_deeply/deeply/nested/subdir/state state
+    [[ $(readlink state_symlink) == foobar ]]
+    test -d state_dir
+
+    # check shadowings
+    assert_file_has_content an_empty_file shadow
+    test -d a_non_empty_file
+    [[ $(readlink another_file) == some_target ]]
+    [[ $(readlink a_working_symlink) == another_file ]]
+    [[ $(readlink a_broken_symlink) == enoent2 ]]
+    test -f an_empty_subdir
+    assert_file_has_content a_nonempty_subdir/foobar shadow
+    assert_file_has_content a_deeply/deeply/nested/subdir/foobar shadow
+    test ! -e a_dir_to_delete
+    test ! -e a_file_to_delete
+    test ! -e a_symlink_to_delete
+    # opaque directory
+    test -d a_dir_to_make_opaque
+    test ! -e a_dir_to_make_opaque/base
+    test -e a_dir_to_make_opaque/state
+
+    # now reboot into an upgrade
+    ostree commit --no-bindings -P -b foobar --tree=ref="${host_commit}"
+    rpm-ostree upgrade
+    /tmp/autopkgtest-reboot "4"
+    ;;
+  "4")
+    cd /foobar
+
+    # check state is still there
+    assert_file_has_content state state
+    assert_file_has_content a_nonempty_subdir/state state
+    assert_file_has_content a_deeply/deeply/nested/subdir/state state
+    [[ $(readlink state_symlink) == foobar ]]
+    test -d state_dir
+
+    # check shadowings are gone
+    [[ -f an_empty_file && ! -s an_empty_file ]]
+    assert_file_has_content a_non_empty_file foobar
+    assert_file_has_content another_file foobar
+    [[ $(readlink a_working_symlink) == an_empty_file ]]
+    [[ $(readlink a_broken_symlink) == enoent ]]
+    test -d an_empty_subdir
+    test -d a_nonempty_subdir
+    assert_file_has_content a_nonempty_subdir/foobar foobar
+    assert_file_has_content a_deeply/deeply/nested/subdir/foobar foobar
+    test -d a_dir_to_delete
+    test -f a_file_to_delete
+    test -L a_symlink_to_delete
+    # opaque directory
+    test -d a_dir_to_make_opaque
+    test -e a_dir_to_make_opaque/base
+    test ! -e a_dir_to_make_opaque/state
+    ;;
+  *) fatal "Unexpected AUTOPKGTEST_REBOOT_MARK=${AUTOPKGTEST_REBOOT_MARK}" ;;
+esac