From 83520869cdeda8cac200fb74dcb55dbc6612b725 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 19 Dec 2024 09:56:21 +0100 Subject: [PATCH 1/2] Fix Qemu hang silently on failed boot Ticket: JIRA-344 Problem: When QEMU failed to boot the hard drive image file provided by the user (for example, we have had cases of users supplying an ext4 image meant for Firecracker instead of a QEMU disk image, which was facilitated by an oversight in the TypeScript SDK), the qemu process and hence the controller would hang indefinitely without showing an error message. Analysis 1. The boot process was not part of the logs or the process output (even inside the server), which is part of what made it hard to debug. 2. QEMU tries to boot via the network even though it is useless 3. After failing all boot methods the qemu process and thus the controller keeps running indefinitely Solution: Change the options for qemu -nographic makes it output the boot process on the standard output (and thus the logs) -boot order=c only boots the first hard drive (not sure if this actually works) -boot reboot-timeout=1 makes it reboot if it fails to boot, but since we have -no-reboot the process just stops (default is -1, no reboot) --- src/aleph/vm/hypervisors/qemu/qemuvm.py | 7 +++++++ src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 5949fbdc..5bcb1313 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -102,6 +102,13 @@ async def start( # Tell to put the output to std fd, so we can include them in the log "-serial", "stdio", + # nographics. 
Seems redundant with -serial stdio but without it the boot process is not displayed on stdout + "-nographic", + # Boot + # order=c only first hard drive + # reboot-timeout in combination with -no-reboot, makes it so qemu stop if there is no bootable device + "-boot", + "order=c,reboot-timeout=1", # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk diff --git a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py index 85ca63c1..89e9c3e8 100644 --- a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py @@ -87,12 +87,18 @@ async def start( "-qmp", f"unix:{self.qmp_socket_path},server,nowait", # Tell to put the output to std fd, so we can include them in the log - "-nographic", "-serial", "stdio", - "--no-reboot", # Rebooting from inside the VM shuts down the machine - "-S", + # nographics. Seems redundant with -serial stdio but without it the boot process is not displayed on stdout + "-nographic", + # Boot + # order=c only first hard drive + # reboot-timeout in combination with -no-reboot, makes it so qemu stop if there is no bootable device + "-boot", + "order=c,reboot-timeout=1", # Confidential options + # Do not start CPU at startup, we will start it via QMP after injecting the secret + "-S", "-object", f"sev-guest,id=sev0,policy={self.sev_policy},cbitpos={sev_info.c_bit_position}," f"reduced-phys-bits={sev_info.phys_addr_reduction}," From b43300b93c3078ca32b00939ff9533f1fc220916 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 19 Dec 2024 10:01:16 +0100 Subject: [PATCH 2/2] Problem: Default log level in controller was not set --- src/aleph/vm/controllers/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aleph/vm/controllers/__main__.py b/src/aleph/vm/controllers/__main__.py index 519270b4..f3cef317 100644 --- a/src/aleph/vm/controllers/__main__.py +++ 
b/src/aleph/vm/controllers/__main__.py @@ -53,6 +53,7 @@ def parse_args(args): help="set loglevel to DEBUG", action="store_const", const=logging.DEBUG, + default=logging.INFO, ) return parser.parse_args(args)