From 5fc3de55837be1849b33b41490e56dd8e3835794 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 10:06:34 -0600
Subject: [PATCH] registry: lock start attempts

When running in parallel, multiple tests could be trying to start
the registry at once. Make this parallel-safe.

Also, use a safer port range for the registry: something
outside of /proc/sys/net/ipv4/ip_local_port_range.

Sorry, I'm including a FIXME section that I haven't investigated
deeply enough.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/helpers.registry.bash | 39 +++++++++++++++++++++++++------
 test/system/setup_suite.bash      |  2 +-
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/test/system/helpers.registry.bash b/test/system/helpers.registry.bash
index 643a0e5151..ff57ad8b87 100644
--- a/test/system/helpers.registry.bash
+++ b/test/system/helpers.registry.bash
@@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE
 # Start a local registry. Only needed on demand (e.g. by 150-login.bats)
 # and then only once: if we start, leave it running until final teardown.
 function start_registry() {
-    if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then
-        # Already started
+    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
+
+    local startflag=${PODMAN_LOGIN_WORKDIR}/OK
+
+    if ! mkdir $AUTHDIR; then
+        # *Possibly* already started. Or, possibly (when running
+        # parallel tests) another process is trying to start it.
+        # Give it some time.
+        local timeout=30
+        while [[ $timeout -gt 0 ]]; do
+            if [[ -e $startflag ]]; then
+                echo "Registry has already been started by another process"
+                return
+            fi
+
+            sleep 1
+            timeout=$((timeout - 1))
+        done
+
+        die "Internal error: timed out waiting for another process to start registry"
 
         # Fixes very obscure corner case in root system tests:
         #  1) we run 150-login tests, starting a registry; then
@@ -26,11 +44,15 @@ function start_registry() {
         #  3) run 700-play, the "private" test, which needs the
         #     already-started registry, but its port is now DROPped,
         #     so the test times out trying to talk to registry
-        run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
+
+        ###### FIXME FIXME FIXME TEMPORARY!
+        ###### Trying to understand flake #23725. What happens if we stop
+        ###### doing the network reload?
+        ###### FIXME FIXME FIXME, should we do it in stop_registry??
+        ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
         return
     fi
 
-    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
     mkdir -p $AUTHDIR
 
     # Registry image; copy of docker.io, but on our own registry
@@ -79,6 +101,9 @@ function start_registry() {
     wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT}
     # ...so we look in container logs for confirmation that registry is running.
     _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid
+
+    touch $startflag
+    echo "I have started the registry"
 }
 
 function stop_registry() {
@@ -103,10 +128,10 @@ function stop_registry() {
         mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount
 
         if [[ $(id -u) -eq 0 ]]; then
-            rm -rf ${PODMAN_LOGIN_WORKDIR}
+            rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         else
             # rootless image data is owned by a subuid
-            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}
+            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         fi
     fi
 
@@ -119,7 +144,7 @@ function stop_registry() {
         echo ""
         echo "lsof -i -P"
         lsof -i -P
-        die "Socket still seems open"
+        die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open"
     fi
 }
 
diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
index 8a3f910839..fa7169a127 100644
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@@ -25,7 +25,7 @@ function setup_suite() {
 
     # FIXME: racy! It could be many minutes between now and when we start it.
     # To mitigate, we use a range not used anywhere else in system tests.
-    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999)
+    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999)
 
     # The above does not handle errors. Do a final confirmation.
     assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \