Skip to content

ubuntu24.04 ci pipeline fix #459

ubuntu24.04 ci pipeline fix

ubuntu24.04 ci pipeline fix #459

Workflow file for this run

# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run this workflow on a schedule
name: Precompiled images
# on:
# schedule:
# - cron: '00 09 * * *' # scheduled job
on:
pull_request:
types:
- opened
- synchronize
branches:
- ci-precompile-ubuntu24.04
push:
branches:
- ci-precompile-ubuntu24.04
jobs:
set-driver-version-matrix:
runs-on: linux-amd64-cpu4
outputs:
driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
dist: ${{ steps.extract_driver_branch.outputs.dist }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Read driver versions
id: extract_driver_branch
run: |
# get driver_branch
# DRIVER_BRANCH=("535" "550" "565")
# DRIVER_BRANCH=("535" "550")
DRIVER_BRANCH=("550")
driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT
# get kernel flavors
# KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
# KERNEL_FLAVORS=("azure" "generic" "nvidia" "oracle")
# KERNEL_FLAVORS=("aws")
KERNEL_FLAVORS=("generic")
kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
# get ubuntu distributions
# DIST=("ubuntu22.04" "ubuntu24.04")
# DIST=("ubuntu22.04")
DIST=("ubuntu24.04")
dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .)
echo "dist=$dist_json" >> $GITHUB_OUTPUT
precompiled-build-image:
needs: set-driver-version-matrix
runs-on: linux-amd64-cpu4
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.repository }}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
GENERATE_ARTIFACTS="false"
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver_branch }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_${{ matrix.dist }}
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION}
- name: Save build image as a tar
env:
DIST: ${{ matrix.dist }}
PRIVATE_REGISTRY: "ghcr.io"
run: |
source kernel_version.txt
docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \
-o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar
# set env for artifacts upload
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=$DIST" >> $GITHUB_ENV
- name: Upload build image as an artifact
uses: actions/upload-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1
determine-e2e-test-matrix:
runs-on: linux-amd64-cpu4
strategy:
matrix:
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
needs:
- precompiled-build-image
- set-driver-version-matrix
outputs:
matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
dist: ${{ steps.set-driver-version-matrix.outputs.dist }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set kernel version
id: set_kernel_version
env:
BASE_TARGET: "jammy"
DIST: ${{ matrix.dist }}
run: |
echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
source ./tests/scripts/ci-precompiled-helpers.sh
KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
if [ -z "$KERNEL_VERSIONS" ]; then
# no new kernel release
echo "Skipping e2e tests"
exit 0
fi
# Convert array to JSON format and assign
echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
e2e-tests-nvidiadriver:
runs-on: linux-amd64-cpu4
needs:
- determine-e2e-test-matrix
- set-driver-version-matrix
if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
strategy:
matrix:
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Holodeck
uses: NVIDIA/[email protected]
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
holodeck_config: "tests/holodeck_${{ matrix.dist }}.yaml"
- name: Get public dns name
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
KERNEL_VERSION="${{ matrix.kernel_version }}"
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
driver_branch_json="${{ needs.set-driver-version-matrix.outputs.driver_branch }}"
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV
- name: Install GitHub CLI
run: |
sudo apt-get update
sudo apt-get install -y gh
- name: Upgrade the kernel for Precompiled e2e test
env:
UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
run: |
status=0
./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
# On the target system, all scripts/test-case exit with code 1 for error handling.
# However, since reboot-related disconnections break the SSH connection
# and can cause the entire job to exit, we should ignore all errors except
# exit code 1. During a reboot, exit code 1 will not be thrown, so handling
# other errors as code 1 will ensure proper management of reboot scenarios
if [ $status -eq 1 ]; then
echo "Kernel version $KERNEL_VERSION upgrade failed"
exit 1
fi
./tests/scripts/remote_retry.sh || status=$?
if [ $status -ne 0 ]; then
echo "Failed to connect to remote instance"
exit $status
fi
- name: Precompiled e2e test gpu driver validation
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true \
--set driver.imagePullPolicy=Never"
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
rc=0
# for precompiled driver we are setting driver branch as driver version
DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }})
for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
image="driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}"
echo "Downloading $image in tests directory"
gh run download --name $image --dir ./tests/
status=0
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# add escape character for space
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
rm -f $IMAGE_PATH
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
publish-precompiled-image:
runs-on: linux-amd64-cpu4
needs:
- set-driver-version-matrix
- determine-e2e-test-matrix
- e2e-tests-nvidiadriver
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set image vars
run: |
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
- name: Download built image artifact
uses: actions/download-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
path: ./
- name: Publish image
run: |
image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}.tar"
echo "uploading $image_path"
docker load -i $image_path
docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}