Skip to content

Commit

Permalink
registry: lock start attempts
Browse files Browse the repository at this point in the history
When tests run in parallel, several of them could try to start
the registry at once. Make registry startup parallel-safe.

Also, use a safer port range for the registry: one that lies
outside of /proc/sys/net/ipv4/ip_local_port_range.

Sorry, I'm including a FIXME section that I haven't investigated
deeply enough.

Signed-off-by: Ed Santiago <[email protected]>
  • Loading branch information
edsantiago committed Sep 17, 2024
1 parent bf61317 commit 5fc3de5
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 8 deletions.
39 changes: 32 additions & 7 deletions test/system/helpers.registry.bash
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,42 @@ unset REGISTRY_AUTH_FILE
# Start a local registry. Only needed on demand (e.g. by 150-login.bats)
# and then only once: if we start, leave it running until final teardown.
function start_registry() {
if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then
# Already started
AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth

local startflag=${PODMAN_LOGIN_WORKDIR}/OK

if ! mkdir $AUTHDIR; then
# *Possibly* already started. Or, possibly (when running
# parallel tests) another process is trying to start it.
# Give it some time.
local timeout=30
while [[ $timeout -gt 0 ]]; do
if [[ -e $startflag ]]; then
echo "Registry has already been started by another process"
return
fi

sleep 1
timeout=$((timeout - 1))
done

die "Internal error: timed out waiting for another process to start registry"

# Fixes very obscure corner case in root system tests:
# 1) we run 150-login tests, starting a registry; then
# 2) run 500-network, which runs iptables -F; then
# 3) run 700-play, the "private" test, which needs the
# already-started registry, but its port is now DROPped,
# so the test times out trying to talk to registry
run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all

###### FIXME FIXME FIXME TEMPORARY!
###### Trying to understand flake #23725. What happens if we stop
###### doing the network reload?
###### FIXME FIXME FIXME, should we do it in stop_registry??
###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
return
fi

AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
mkdir -p $AUTHDIR

# Registry image; copy of docker.io, but on our own registry
Expand Down Expand Up @@ -79,6 +101,9 @@ function start_registry() {
wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT}
# ...so we look in container logs for confirmation that registry is running.
_PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid

touch $startflag
echo "I have started the registry"
}

function stop_registry() {
Expand All @@ -103,10 +128,10 @@ function stop_registry() {
mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount

if [[ $(id -u) -eq 0 ]]; then
rm -rf ${PODMAN_LOGIN_WORKDIR}
rm -rf ${PODMAN_LOGIN_WORKDIR}/*
else
# rootless image data is owned by a subuid
run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}
run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/*
fi
fi

Expand All @@ -119,7 +144,7 @@ function stop_registry() {
echo ""
echo "lsof -i -P"
lsof -i -P
die "Socket still seems open"
die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open"
fi
}

Expand Down
2 changes: 1 addition & 1 deletion test/system/setup_suite.bash
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ function setup_suite() {

# FIXME: racy! It could be many minutes between now and when we start it.
# To mitigate, we use a range not used anywhere else in system tests.
export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999)
export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999)

# The above does not handle errors. Do a final confirmation.
assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \
Expand Down

0 comments on commit 5fc3de5

Please sign in to comment.