Skip to content

Commit

Permalink
Enable TestMNISTRayClusterSDK test
Browse files Browse the repository at this point in the history
  • Loading branch information
sutaakar authored and openshift-merge-bot[bot] committed Nov 10, 2023
1 parent e95759a commit 72cace9
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 94 deletions.
16 changes: 16 additions & 0 deletions .github/actions/kind/action.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
name: "Set up KinD"
description: "Step to start and configure KinD cluster"

inputs:
kind-node-hostname:
description: "Hostname of the main kind node"
required: false
default: kind

runs:
using: "composite"
steps:
Expand Down Expand Up @@ -56,3 +62,13 @@ runs:
curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
- name: Add ${{ inputs.kind-node-hostname }} host to machine hosts
shell: bash
run: echo "127.0.0.1 ${{ inputs.kind-node-hostname }}" | sudo tee -a /etc/hosts

- name: Set env variables for tests to properly leverage KinD cluster
shell: bash
run: |
echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
echo "CLUSTER_HOSTNAME=${{ inputs.kind-node-hostname }}" >> $GITHUB_ENV
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.19
require (
github.com/onsi/gomega v1.27.10
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
github.com/project-codeflare/instascale v0.3.0
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0
github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16 h1:TRMLDP6IYt0CAd3+BkvY/r2lkpjI3sOsxf3tnQojZ9k=
github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
github.com/project-codeflare/instascale v0.3.0 h1:PSlwbqqUsFTkTQ5KUhMFRebfokySnEZwav97xZixLQs=
github.com/project-codeflare/instascale v0.3.0/go.mod h1:IU1Wl+zqTpMpZ49BOcr6U+A6gF3AjcmFdKo9ZwP3TDI=
github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0 h1:dU2Ev0SijdNm30Y9mjdKJL1Fp6l07rnRBKhSbx1kX9g=
Expand Down
24 changes: 23 additions & 1 deletion test/e2e/mnist_raycluster_sdk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
import os

from time import sleep

Expand All @@ -8,17 +9,38 @@
from codeflare_sdk.job.jobs import DDPJobDefinition

namespace = sys.argv[1]
ray_image = os.getenv('RAY_IMAGE')
host = os.getenv('CLUSTER_HOSTNAME')

ingress_options = {}
if host is not None:
ingress_options = {
"ingresses": [
{
"ingressName": "ray-dashboard",
"port": 8265,
"pathType": "Prefix",
"path": "/",
"host": host,
},
]
}


cluster = Cluster(ClusterConfiguration(
name='mnist',
namespace=namespace,
num_workers=1,
head_cpus='500m',
head_memory=2,
min_cpus='500m',
max_cpus=1,
min_memory=0.5,
max_memory=1,
max_memory=2,
num_gpus=0,
instascale=False,
image=ray_image,
ingress_options=ingress_options,
))

cluster.up()
Expand Down
151 changes: 63 additions & 88 deletions test/e2e/mnist_raycluster_sdk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,104 +40,59 @@ func TestMNISTRayClusterSDK(t *testing.T) {
test := With(t)
test.T().Parallel()

// Currently blocked by https://github.com/project-codeflare/codeflare-sdk/pull/251 , remove the skip once SDK with the PR is released
test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/251")

// Create a namespace
namespace := test.NewTestNamespace()

// Test configuration
config := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ConfigMap",
},
ObjectMeta: metav1.ObjectMeta{
Name: "mnist-raycluster-sdk",
Namespace: namespace.Name,
config := CreateConfigMap(test, namespace.Name, map[string][]byte{
// SDK script
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
// pip requirements
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
// MNIST training script
"mnist.py": ReadFile(test, "mnist.py"),
})

// Create RBAC, retrieve token for user with limited rights
policyRules := []rbacv1.PolicyRule{
{
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
APIGroups: []string{mcadv1beta1.GroupName},
Resources: []string{"appwrappers"},
},
BinaryData: map[string][]byte{
// SDK script
"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
// pip requirements
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
// MNIST training script
"mnist.py": ReadFile(test, "mnist.py"),
{
Verbs: []string{"get", "list"},
APIGroups: []string{rayv1.GroupVersion.Group},
Resources: []string{"rayclusters", "rayclusters/status"},
},
Immutable: Ptr(true),
}
config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)

// SDK client RBAC
serviceAccount := &corev1.ServiceAccount{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ServiceAccount",
{
Verbs: []string{"get", "list"},
APIGroups: []string{"route.openshift.io"},
Resources: []string{"routes"},
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk-user",
Namespace: namespace.Name,
{
Verbs: []string{"get", "list"},
APIGroups: []string{"networking.k8s.io"},
Resources: []string{"ingresses"},
},
}
serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())

role := &rbacv1.Role{
TypeMeta: metav1.TypeMeta{
APIVersion: rbacv1.SchemeGroupVersion.String(),
Kind: "Role",
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk",
Namespace: namespace.Name,
},
Rules: []rbacv1.PolicyRule{
{
Verbs: []string{"get", "create", "delete", "list", "patch", "update"},
APIGroups: []string{mcadv1beta1.GroupName},
Resources: []string{"appwrappers"},
},
{
Verbs: []string{"get", "list"},
APIGroups: []string{rayv1.GroupVersion.Group},
Resources: []string{"rayclusters", "rayclusters/status"},
},
{
Verbs: []string{"get", "list"},
APIGroups: []string{"route.openshift.io"},
Resources: []string{"routes"},
},
// Create cluster-wide RBAC, required for the SDK's OpenShift check
// TODO: reevaluate once the SDK changes its OpenShift detection logic
clusterPolicyRules := []rbacv1.PolicyRule{
{
Verbs: []string{"get", "list"},
APIGroups: []string{"config.openshift.io"},
Resources: []string{"ingresses"},
ResourceNames: []string{"cluster"},
},
}
role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())

roleBinding := &rbacv1.RoleBinding{
TypeMeta: metav1.TypeMeta{
APIVersion: rbacv1.SchemeGroupVersion.String(),
Kind: "RoleBinding",
},
ObjectMeta: metav1.ObjectMeta{
Name: "sdk",
},
RoleRef: rbacv1.RoleRef{
APIGroup: rbacv1.SchemeGroupVersion.Group,
Kind: "Role",
Name: role.Name,
},
Subjects: []rbacv1.Subject{
{
Kind: "ServiceAccount",
APIGroup: corev1.SchemeGroupVersion.Group,
Name: serviceAccount.Name,
Namespace: serviceAccount.Namespace,
},
},
}
_, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
sa := CreateServiceAccount(test, namespace.Name)
role := CreateRole(test, namespace.Name, policyRules)
CreateRoleBinding(test, namespace.Name, sa, role)
clusterRole := CreateClusterRole(test, clusterPolicyRules)
CreateClusterRoleBinding(test, sa, clusterRole)

job := &batchv1.Job{
TypeMeta: metav1.TypeMeta{
Expand All @@ -161,7 +116,8 @@ func TestMNISTRayClusterSDK(t *testing.T) {
// See https://github.com/project-codeflare/codeflare-sdk/pull/146
Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e",
Env: []corev1.EnvVar{
corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"},
{Name: "PYTHONUSERBASE", Value: "/workdir"},
{Name: "RAY_IMAGE", Value: GetRayImage()},
},
Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name},
VolumeMounts: []corev1.VolumeMount{
Expand Down Expand Up @@ -206,12 +162,31 @@ func TestMNISTRayClusterSDK(t *testing.T) {
},
},
RestartPolicy: corev1.RestartPolicyNever,
ServiceAccountName: serviceAccount.Name,
ServiceAccountName: sa.Name,
},
},
},
}
job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
if GetClusterType(test) == KindCluster {
// Take the first KinD node and alias the cluster hostname to that node's IP inside the SDK pod
node := GetNodes(test)[0]
hostname := GetClusterHostname(test)
IP := GetNodeInternalIP(test, node)

test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP)
job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{
{
IP: IP,
Hostnames: []string{hostname},
},
}

// Propagate hostname into Python code as env variable
hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname}
job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar)
}

job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name)

Expand Down
4 changes: 2 additions & 2 deletions test/e2e/mnist_rayjob_mcad_raycluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1G"),
corev1.ResourceMemory: resource.MustParse("2G"),
},
},
VolumeMounts: []corev1.VolumeMount{
Expand Down Expand Up @@ -168,7 +168,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1G"),
corev1.ResourceMemory: resource.MustParse("2G"),
},
},
},
Expand Down

0 comments on commit 72cace9

Please sign in to comment.