Skip to content

Commit

Permalink
Add tox for running MLflow UATs
Browse files Browse the repository at this point in the history
  • Loading branch information
misohu committed Sep 26, 2024
1 parent 5b7efe6 commit e934b5c
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 59 deletions.
125 changes: 67 additions & 58 deletions .github/workflows/deploy-eks.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Create EKS cluster, deploy CKF and MLflow and run MLflow bundle tests
on:
workflow_dispatch:
workflow_dispatch: # This event allows manual triggering from the GitHub UI
inputs:
bundle_version:
description: 'Comma-separated list of bundle versions e.g. 2.15, latest. Make sure that the corresponding K8s version is supported by the cloud.'
Expand All @@ -13,7 +13,7 @@ on:
description: 'Branch to run the UATs from e.g. main or track/1.9. By default, this is defined by the dependencies.yaml file.'
required: false
schedule:
- cron: "17 0 * * 2"
- cron: "23 0 * * 2"
jobs:
preprocess-input:
runs-on: ubuntu-22.04
Expand All @@ -32,15 +32,14 @@ jobs:
id: process_bundle_versions
run: python scripts/gh-actions/parse_versions.py

deploy-ckf-to-aks:
deploy-ckf-to-eks:
needs: preprocess-input
runs-on: ubuntu-22.04
strategy:
matrix:
bundle_version: ${{ fromJSON(needs.preprocess-input.outputs.processed_bundle_versions) }}
fail-fast: false
env:
AZURE_CORE_OUTPUT: none
PYTHON_VERSION: "3.8"

steps:
Expand Down Expand Up @@ -74,47 +73,64 @@ jobs:
wget https://bootstrap.pypa.io/get-pip.py
python${{ env.PYTHON_VERSION }} get-pip.py
python${{ env.PYTHON_VERSION }} -m pip install tox
sudo snap install juju --classic --channel=${{ env.JUJU_VERSION }}/stable
sudo snap install charmcraft --classic
# We need to install Juju from the binary release because of this bug: https://bugs.launchpad.net/juju/+bug/2007575
curl -LO https://launchpad.net/juju/${{ env.JUJU_VERSION }}/${{ env.JUJU_VERSION_WITH_PATCH }}/+download/juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz
tar xf juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz
sudo install -o root -g root -m 0755 juju /usr/local/bin/juju
juju version
- uses: azure/login@v1
with:
creds: ${{ secrets.BUNDLE_KUBEFLOW_AKS_SERVICE_PRINCIPAL }}

- name: Create resource group and cluster
- name: Configure AWS Credentials
env:
AWS_ACCESS_KEY_ID: ${{ secrets.BUNDLE_KUBEFLOW_EKS_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.BUNDLE_KUBEFLOW_EKS_AWS_SECRET_ACCESS_KEY }}
run: |
aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID
aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY
aws configure set default.region eu-central-1
- name: Install kubectl
run: |
sudo snap install kubectl --classic --channel=${{ env.K8S_VERSION }}/stable
mkdir ~/.kube
kubectl version --client
- name: Install eksctl
run: |
sudo apt-get update
sudo apt-get install -y unzip
curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin
eksctl version
- name: Create cluster
run: |
# We need to remove the dot from the version
# due to cluster naming restrictions
version=${{ matrix.bundle_version }}
KF_VERSION="kf-${version//.}"
RESOURCE_GROUP=${KF_VERSION}-ResourceGroup
NAME=${KF_VERSION}-AKSCluster
LOCATION=westeurope
echo "RESOURCE_GROUP=${RESOURCE_GROUP}" >> $GITHUB_ENV
echo "NAME=${NAME}" >> $GITHUB_ENV
echo "LOCATION=${LOCATION}" >> $GITHUB_ENV
az group create --name ${RESOURCE_GROUP} --location ${LOCATION}
az aks create \
--resource-group ${RESOURCE_GROUP} \
--name ${NAME} \
--kubernetes-version ${{ env.K8S_VERSION }} \
--node-count 2 \
--node-vm-size Standard_D8s_v3 \
--node-osdisk-size 100 \
--node-osdisk-type Managed \
--os-sku Ubuntu \
--no-ssh-key
- name: Add AKS cloud to juju and bootstrap controller
VERSION=${{ matrix.bundle_version }}
VERSION_WITHOUT_DOT="${VERSION//.}"
yq e ".metadata.name |= \"kubeflow-test-$VERSION_WITHOUT_DOT\"" -i .github/cluster.yaml
yq e ".metadata.version |= \"${{ env.K8S_VERSION }}\"" -i .github/cluster.yaml
eksctl create cluster -f .github/cluster.yaml
kubectl get nodes
- name: Setup juju
run: |
az aks get-credentials --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --admin
juju add-k8s aks --client
juju bootstrap aks aks-controller
juju add-k8s eks --client
juju bootstrap eks kubeflow-controller
juju add-model kubeflow
- name: Test bundle deployment
run: |
juju deploy kubeflow --channel=$KUBEFLOW_CHANNEL --trust &&
juju deploy ./releases/2.15/stable/mlflow/bundle.yaml --trust &&
juju deploy resource-dispatcher --channel=$RESOURCE_DISPATCHER_CHANNEL --trust &&
juju integrate mlflow-server:secrets resource-dispatcher:secrets &&
juju integrate mlflow-server:pod-defaults resource-dispatcher:pod-defaults &&
juju integrate mlflow-minio:object-storage kserve-controller:object-storage &&
juju integrate kserve-controller:service-accounts resource-dispatcher:service-accounts &&
juju integrate kserve-controller:secrets resource-dispatcher:secrets &&
juju integrate mlflow-server:ingress istio-pilot:ingress &&
juju integrate mlflow-server:dashboard-links kubeflow-dashboard:links
tox -vve test_bundle_deployment-${{ matrix.bundle_version }} -- --model kubeflow --keep-models -vv -s
- name: Run Kubeflow UATs
Expand All @@ -127,8 +143,9 @@ jobs:
# On failure, capture debugging resources
- name: Save debug artifacts
uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main
if: always()
if: failure() || cancelled()

# On failure, capture debugging resources
- name: Get juju status
run: juju status
if: failure() || cancelled()
Expand All @@ -141,10 +158,6 @@ jobs:
run: kubectl get all -A
if: failure() || cancelled()

- name: Describe all pods
if: failure() || cancelled()
run: kubectl describe pods --all-namespaces

- name: Get logs from pods with status = Pending
run: kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
Expand All @@ -157,21 +170,17 @@ jobs:
run: kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()

- name: Delete AKS cluster
if: always()
run: az aks delete --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --yes

- name: Delete resource groups
if: always()
run: |
az group delete --name ${{ env.RESOURCE_GROUP }} --yes
if [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
az group delete --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }} --yes
fi
- name: Check that resource groups have been deleted, else fail
- name: Delete EKS cluster
if: always()
run: |
if [ "$(az group exists --name ${{ env.RESOURCE_GROUP }} )" = "true" ] || [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
exit 1
fi
VERSION=${{ matrix.bundle_version }}
VERSION_WITHOUT_DOT="${VERSION//.}"
eksctl delete cluster --region eu-central-1 --name=kubeflow-test-$VERSION_WITHOUT_DOT
delete-unattached-volumes:
if: always()
uses: ./.github/workflows/delete-aws-volumes.yaml
secrets: inherit
with:
region: eu-central-1
needs: [deploy-ckf-to-eks]
2 changes: 1 addition & 1 deletion tests/integration/test_bundle_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class TestCharm:
@pytest.mark.abort_on_fail
async def test_bundle_deployment_works(self, ops_test: OpsTest, lightkube_client, bundle_path):
# Run all Juju commands in a single subprocess call
run_juju_commands_in_one(bundle_path, KUBEFLOW_CHANNEL, RESOURCE_DISPATCHER_CHANNEL)
# run_juju_commands_in_one(bundle_path, KUBEFLOW_CHANNEL, RESOURCE_DISPATCHER_CHANNEL)

# Wait for the model to become active and idle
await ops_test.model.wait_for_idle(
Expand Down

0 comments on commit e934b5c

Please sign in to comment.