-
Notifications
You must be signed in to change notification settings - Fork 11
177 lines (153 loc) · 6.84 KB
/
deploy-eks.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
name: Create EKS cluster, deploy CKF and MLflow and run MLflow bundle tests
on:
workflow_dispatch:
inputs:
bundle_version:
description: 'Comma-separated list of bundle versions e.g. 2.15, latest. Make sure that the corresponding K8s version is supported by the cloud.'
default: '2.15, latest'
required: true
k8s_version:
description: 'Kubernetes version to be used for the AKS cluster'
required: false
uats_branch:
description: 'Branch to run the UATs from e.g. main or track/1.9. By default, this is defined by the dependencies.yaml file.'
required: false
schedule:
- cron: "17 0 * * 2"
jobs:
preprocess-input:
runs-on: ubuntu-22.04
outputs:
processed_bundle_versions: ${{ steps.process_bundle_versions.outputs.bundle_versions }}
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Process bundle versions
id: process_bundle_versions
run: python scripts/gh-actions/parse_versions.py
deploy-ckf-to-aks:
needs: preprocess-input
runs-on: ubuntu-22.04
strategy:
matrix:
bundle_version: ${{ fromJSON(needs.preprocess-input.outputs.processed_bundle_versions) }}
fail-fast: false
env:
AZURE_CORE_OUTPUT: none
PYTHON_VERSION: "3.8"
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Run YAML to Github Output Action
id: yaml-output
uses: christian-ci/action-yaml-github-output@v2
with:
file_path: ".github/dependencies.yaml"
main_key: ${{ matrix.bundle_version }}
- name: Update ENV variables from inputs if available
run: |
K8S_VERSION=${{ inputs.k8s_version || env.K8S_VERSION }}
echo "K8S_VERSION=${K8S_VERSION}" >> $GITHUB_ENV
UATS_BRANCH=${{ inputs.uats_branch || env.UATS_BRANCH }}
echo "UATS_BRANCH=${UATS_BRANCH}" >> $GITHUB_ENV
# Remove once https://github.com/canonical/bundle-kubeflow/issues/761
# is resolved and applied to uats repository.
- name: Install python ${{ env.PYTHON_VERSION }}
run: |
sudo add-apt-repository ppa:deadsnakes/ppa -y
sudo apt update -y
sudo apt install python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-distutils python${{ env.PYTHON_VERSION }}-venv -y
- name: Install CLI tools
run: |
wget https://bootstrap.pypa.io/get-pip.py
python${{ env.PYTHON_VERSION }} get-pip.py
python${{ env.PYTHON_VERSION }} -m pip install tox
sudo snap install juju --classic --channel=${{ env.JUJU_VERSION }}/stable
sudo snap install charmcraft --classic
juju version
- uses: azure/login@v1
with:
creds: ${{ secrets.BUNDLE_KUBEFLOW_AKS_SERVICE_PRINCIPAL }}
- name: Create resource group and cluster
run: |
# We need to remove the dot from version
# due to cluster naming restrictions
version=${{ matrix.bundle_version }}
KF_VERSION="kf-${version//.}"
RESOURCE_GROUP=${KF_VERSION}-ResourceGroup
NAME=${KF_VERSION}-AKSCluster
LOCATION=westeurope
echo "RESOURCE_GROUP=${RESOURCE_GROUP}" >> $GITHUB_ENV
echo "NAME=${NAME}" >> $GITHUB_ENV
echo "LOCATION=${LOCATION}" >> $GITHUB_ENV
az group create --name ${RESOURCE_GROUP} --location ${LOCATION}
az aks create \
--resource-group ${RESOURCE_GROUP} \
--name ${NAME} \
--kubernetes-version ${{ env.K8S_VERSION }} \
--node-count 2 \
--node-vm-size Standard_D8s_v3 \
--node-osdisk-size 100 \
--node-osdisk-type Managed \
--os-sku Ubuntu \
--no-ssh-key
- name: Add AKS cloud to juju and bootstrap controller
run: |
az aks get-credentials --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --admin
juju add-k8s aks --client
juju bootstrap aks aks-controller
juju add-model kubeflow
- name: Test bundle deployment
run: |
tox -vve test_bundle_deployment-${{ matrix.bundle_version }} -- --model kubeflow --keep-models -vv -s
- name: Run Kubeflow UATs
run: |
git clone https://github.com/canonical/charmed-kubeflow-uats.git ~/charmed-kubeflow-uats
cd ~/charmed-kubeflow-uats
git checkout ${{ env.UATS_BRANCH }}
tox -e mlflow-remote
# On failure, capture debugging resources
- name: Save debug artifacts
uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main
if: always()
- name: Get juju status
run: juju status
if: failure() || cancelled()
- name: Get juju debug logs
run: juju debug-log --replay --no-tail
if: failure() || cancelled()
- name: Get all kubernetes resources
run: kubectl get all -A
if: failure() || cancelled()
- name: Describe all pods
if: failure() || cancelled()
run: kubectl describe pods --all-namespaces
- name: Get logs from pods with status = Pending
run: kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Get logs from pods with status = Failed
run: kubectl -n kubeflow get pods | tail -n +2 | grep Failed | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Get logs from pods with status = CrashLoopBackOff
run: kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Delete AKS cluster
if: always()
run: az aks delete --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --yes
- name: Delete resource groups
if: always()
run: |
az group delete --name ${{ env.RESOURCE_GROUP }} --yes
if [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
az group delete --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }} --yes
fi
- name: Check that resource groups have been deleted, else fail
if: always()
run: |
if [ "$(az group exists --name ${{ env.RESOURCE_GROUP }} )" = "true" ] || [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
exit 1
fi