Skip to content

Fix Qemu hang silently on failed boot #1450

Fix Qemu hang silently on failed boot

Fix Qemu hang silently on failed boot #1450

---
# These are end-to-end tests running on ephemeral DigitalOcean "Droplet" virtual machines
# with the different operating systems that are supported.
#
# The main focus of these tests is to ensure that the packaging works on all supported platforms
# and to ensure the compatibility of dependencies (system and vendored) across these platforms.
name: "Testing on DigitalOcean Droplets"
# Run automatically on main branches, Pull Request updates and allow manual execution using `workflow_dispatch`.
on:
push:
branches:
- main
pull_request:
types:
- "opened"
- "reopened"
- "synchronize"
- "ready_for_review"
workflow_dispatch:
jobs:
run_on_droplet:
name: "Test Droplet with ${{ matrix.os_config.os_name }}-${{ matrix.check_vm.alias\
\ }}"
runs-on: ubuntu-latest
concurrency: "${{ matrix.os_config.concurrency_group }}-${{ matrix.check_vm.alias\
\ }}"
timeout-minutes: 10
strategy:
fail-fast: false
matrix:
# Check compatibility with all supported OSes.
os_config:
- os_name: "Debian 12"
os_image: "debian-12-x64"
alias: "debian-12"
package_build_command: "all-podman-debian-12"
package_name: "aleph-vm.debian-12.deb"
concurrency_group: "droplet-aleph-vm-debian-12"
- os_name: "Ubuntu 22.04"
os_image: "ubuntu-22-04-x64"
alias: "ubuntu-22-04"
package_build_command: "all-podman-ubuntu-2204"
package_name: "aleph-vm.ubuntu-22.04.deb"
concurrency_group: "droplet-aleph-vm-ubuntu-22-04"
- os_name: "Ubuntu 24.04"
os_image: "ubuntu-24-04-x64"
alias: "ubuntu-24-04"
package_build_command: "all-podman-ubuntu-2404"
package_name: "aleph-vm.ubuntu-24.04.deb"
concurrency_group: "droplet-aleph-vm-ubuntu-24-04"
# Check compatibility with all supported runtimes.
check_vm:
- alias: "runtime-6770" # Old runtime, using Debian 11
item_hash: "67705389842a0a1b95eaa408b009741027964edc805997475e95c505d642edd8"
query_params: "?retro-compatibility=true"
- alias: "runtime-3fc0" # Newer runtime, using Debian 12 but now old SDK
item_hash: "3fc0aa9569da840c43e7bd2033c3c580abb46b007527d6d20f2d4e98e867f7af"
query_params: "?retro-compatibility=true"
- alias: "runtime-63fa" # Latest runtime, using Debian 12 and SDK 0.9.0
item_hash: "63faf8b5db1cf8d965e6a464a0cb8062af8e7df131729e48738342d956f29ace"
query_params: ""
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
submodules: true
- name: Install doctl
uses: digitalocean/action-doctl@v2
with:
token: ${{ secrets.DIGITALOCEAN_ACCESS_TOKEN }}
- name: Setup SSH private key
run: |
mkdir ~/.ssh
echo $DIGITALOCEAN_SSH_PRIVATE_KEY | base64 --decode > ~/.ssh/id_ed25519
chmod 0700 ~/.ssh
chmod 0600 ~/.ssh/id_ed25519
env:
DIGITALOCEAN_SSH_PRIVATE_KEY: ${{ secrets.DIGITALOCEAN_SSH_PRIVATE_KEY }}
- name: Create the Droplet
run: |
doctl compute droplet create \
--image ${{ matrix.os_config.os_image }} \
--size c-4 \
--region ams3 \
--vpc-uuid 5976b7bd-4417-49e8-8522-672aaa920c30 \
--enable-ipv6 \
--ssh-keys ab:2b:25:16:46:6f:25:d0:80:63:e5:be:67:04:cb:64 \
aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }}
- name: Build Package
run: |
cd packaging && make ${{ matrix.os_config.package_build_command }} && cd ..
ls packaging/target
- name: Get droplet ip and export it in env
run: |
echo "DROPLET_IPV4=$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)" >> "$GITHUB_ENV"
- name: Wait for the system to setup and boot
run: |
until ssh-keyscan -H ${DROPLET_IPV4}; do sleep 1; done
timeout-minutes: 3
- name: Install Aleph-VM on the Droplet
run: |
ssh-keyscan -H ${DROPLET_IPV4} > ~/.ssh/known_hosts
# Configuration
echo ALEPH_VM_SUPERVISOR_HOST=0.0.0.0 >> supervisor.env
echo ALEPH_VM_ALLOCATION_TOKEN_HASH=9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08 >> supervisor.env
echo ALEPH_VM_CHECK_FASTAPI_VM_ID=${{ matrix.check_vm.item_hash }} >> supervisor.env
echo ALEPH_VM_SENTRY_DSN=${{ secrets.SENTRY_DSN }} >> supervisor.env
ssh root@${DROPLET_IPV4} mkdir -p /etc/aleph-vm/
scp supervisor.env root@${DROPLET_IPV4}:/etc/aleph-vm/supervisor.env
# Wait a few seconds for DigitalOcean to setup the Droplet using apt, which conflicts with our comands:
sleep 5
# Wait for /var/lib/apt/lists/lock to be unlocked on the remote host via SSH.
while ssh root@${DROPLET_IPV4} lsof /var/lib/apt/lists/lock; do sleep 1; done
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 update"
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 upgrade -y"
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 install -y docker.io apparmor-profiles"
ssh root@${DROPLET_IPV4} "docker pull ghcr.io/aleph-im/vm-connector:alpha"
ssh root@${DROPLET_IPV4} "docker run -d -p 127.0.0.1:4021:4021/tcp --restart=always --name vm-connector ghcr.io/aleph-im/vm-connector:alpha"
scp packaging/target/${{ matrix.os_config.package_name }} root@${DROPLET_IPV4}:/opt
# "--force-confold" keeps existing config files during package install/upgrade, avoiding prompts.
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 -o Dpkg::Options::="--force-confold" install -y /opt/${{ matrix.os_config.package_name }}"
# Allow some time for IPFS Kubo to start
sleep 5
- name: Test Aleph-VM on the Droplet
id: test-aleph-vm
if: always()
continue-on-error: true
run: |
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/about/usage/system"
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/status/check/fastapi${{ matrix.check_vm.query_params }}"
- name: Test Aleph-VM on the Droplet again restarting the server first
if: steps.test-aleph-vm.outcome == 'failure'
run: |
# If the first execution fails, restart supervisor and try again
ssh root@${DROPLET_IPV4} "systemctl restart aleph-vm-supervisor"
sleep 5
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/status/check/fastapi${{ matrix.check_vm.query_params }}"
- name: Schedule an instance on the Droplet by faking a call from the scheduler
run: |
curl --retry 5 --max-time 10 --fail -X POST -H "Content-Type: application/json" \
-H "X-Auth-Signature: test" \
-d '{"persistent_vms": [], "instances": ["${{ matrix.check_vm.item_hash }}"]}' \
"http://${DROPLET_IPV4}:4020/control/allocations"
- name: Fetch system usage endpoint
run: |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)"
curl -X GET -H "Content-Type: application/json" \
"http://${DROPLET_IPV4}:4020/about/usage/system"
- name: Run the sevctl command to ensure it's properly packaged and working
run: |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)"
ssh root@${DROPLET_IPV4} "/opt/sevctl --version"
- name: Export aleph logs
if: always()
run: |
ssh root@${DROPLET_IPV4} "journalctl -u aleph-vm-supervisor"
- name: Cleanup
if: always()
run: |-
DROPLET_IDS=$(doctl compute droplet list --format "ID,Name" --no-header | grep "aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }}" | awk '{print $1}')
for DROPLET_ID in $DROPLET_IDS; do
echo "Deleting droplet with ID: $DROPLET_ID"
doctl compute droplet delete --force $DROPLET_ID
done