From 23adfa53684d1923d198af7962c18ac648ab6457 Mon Sep 17 00:00:00 2001
From: Aashish Radhakrishnan <aaradhak@redhat.com>
Date: Tue, 12 Nov 2024 15:12:38 -0500
Subject: [PATCH] mantle/kola: Add function to enhance upgrade stability

This commit introduces the `waitForUpgradeToBeStaged` function to
improve the stability of the kola upgrade tests by reducing
timeout-related failures.
The new function sets up a systemd path unit to monitor updates in the
`/ostree/repo/refs/heads/ostree/1/1` directory, triggering a stop on
`wait.service` once changes are detected.
By blocking there until later in the upgrade process, we minimize the
waiting period in `runFnAndWaitForRebootIntoVersion` so that it covers
only the actual reboot phase.

Author : Dusty Mabe <dusty@dustymabe.com>
Ref: https://github.com/coreos/fedora-coreos-tracker/issues/1805
---
 mantle/kola/tests/upgrade/basic.go | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/mantle/kola/tests/upgrade/basic.go b/mantle/kola/tests/upgrade/basic.go
index a3404c9c29..2267bca161 100644
--- a/mantle/kola/tests/upgrade/basic.go
+++ b/mantle/kola/tests/upgrade/basic.go
@@ -313,10 +313,31 @@ func runFnAndWaitForRebootIntoVersion(c cluster.TestCluster, m platform.Machine,
 	}
 }
 
+// waitForUpgradeToBeStaged blocks until the upgrade updates the ostree refs on m.
+func waitForUpgradeToBeStaged(c cluster.TestCluster, m platform.Machine) {
+	// Set up a systemd path unit (refchanged.path) that triggers when
+	// ostree updates the refs under /ostree/repo/refs/heads/ostree/1/1
+	// behind the scenes; refchanged.service then stops wait.service.
+	// systemd-run --wait keeps the second command (and thus execution
+	// here) from returning until wait.service has been stopped.
+	//
+	// This delays the wait inside runFnAndWaitForRebootIntoVersion until
+	// later in the upgrade process: we are seeing failures due to
+	// timeouts, and limiting that wait to just the actual reboot is an
+	// attempt to reduce the variability.
+	// https://github.com/coreos/fedora-coreos-tracker/issues/1805
+	//
+	// Note: if systemd-run ever gains the ability to --wait when
+	//       generating a path unit then the below can be simplified.
+	c.RunCmdSync(m, "sudo systemd-run -u refchanged --path-property=PathChanged=/ostree/repo/refs/heads/ostree/1/1 systemctl stop wait.service")
+	c.RunCmdSync(m, "sudo systemd-run --wait -u wait sleep infinity")
+}
+
 func waitForUpgradeToVersion(c cluster.TestCluster, m platform.Machine, version string) {
 	runFnAndWaitForRebootIntoVersion(c, m, version, func() {
 		// Start Zincati so it will apply the update
 		c.RunCmdSync(m, "sudo systemctl start zincati.service")
+		waitForUpgradeToBeStaged(c, m) // keep this callback from returning until the ostree refs change
 	})
 }
 
@@ -328,6 +349,7 @@ func rpmostreeRebase(c cluster.TestCluster, m platform.Machine, ref, version str
 		// we use systemd-run here so that we can test the --reboot path
 		// without having SSH not exit cleanly, which would cause an error
 		c.RunCmdSyncf(m, "sudo systemd-run rpm-ostree rebase --reboot %s", ref)
+		waitForUpgradeToBeStaged(c, m)
 	})
 }