From d7fbf9b83a856351174abf0828fc154841260ba9 Mon Sep 17 00:00:00 2001
From: Babis Chalios
Date: Fri, 11 Oct 2024 14:47:37 +0200
Subject: [PATCH] fix(ci): epoll() on pidfd to wait for Firecracker exit

Currently, we use psutil.pid_exists in a loop with a timeout of 10
seconds. This is racy and indeed sometimes we hit it in our CI.

Substitute this mechanism with calling epoll() on the pidfd of the
process instead. This should deterministically block until the process
exits. If there's something else wrong, we will hit the pytest timeout.

Signed-off-by: Babis Chalios
---
 tests/framework/utils.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/tests/framework/utils.py b/tests/framework/utils.py
index b44676dc7a3..d154b7af249 100644
--- a/tests/framework/utils.py
+++ b/tests/framework/utils.py
@@ -7,6 +7,7 @@
 import os
 import platform
 import re
+import select
 import signal
 import stat
 import subprocess
@@ -450,16 +451,39 @@ def run_guest_cmd(ssh_connection, cmd, expected, use_json=False):
     assert stdout == expected
 
 
-@retry(wait=wait_fixed(1), stop=stop_after_attempt(10), reraise=True)
+def get_process_pidfd(pid):
+    """Get a pidfd file descriptor for the process with PID `pid`
+
+    Will return a pid file descriptor for the process with PID `pid` if it is
+    still alive. If the process has already exited it will return `None`.
+
+    Any other error while calling the system call, will raise an OSError
+    exception.
+    """
+    try:
+        pidfd = os.pidfd_open(pid)
+    except ProcessLookupError:
+        return None
+
+    return pidfd
+
+
 def wait_process_termination(p_pid):
     """Wait for a process to terminate.
 
-    Will return sucessfully if the process
+    Will return successfully if the process
     got indeed killed or raises an exception if the process
     is still alive after retrying several times.
     """
-    if psutil.pid_exists(p_pid):
-        raise Exception(f"[{p_pid}] process is still alive")
+    pidfd = get_process_pidfd(p_pid)
+
+    # If pidfd is None the process has already terminated
+    if pidfd is not None:
+        epoll = select.epoll()
+        epoll.register(pidfd, select.EPOLLIN)
+        # This will return once the process exits
+        epoll.poll()
+        os.close(pidfd)
 
 
 def get_firecracker_version_from_toml():