diff --git a/.github/actions/kind/action.yml b/.github/actions/kind/action.yml
index 59dcafef..f76e60af 100644
--- a/.github/actions/kind/action.yml
+++ b/.github/actions/kind/action.yml
@@ -1,6 +1,12 @@
 name: "Set up KinD"
 description: "Step to start and configure KinD cluster"
 
+inputs:
+  kind-node-hostname:
+    description: "Hostname of the main kind node"
+    required: false
+    default: kind
+
 runs:
   using: "composite"
   steps:
@@ -56,3 +62,13 @@
         curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
         kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
         kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
+
+    - name: Add ${{ inputs.kind-node-hostname }} host to machine hosts
+      shell: bash
+      run: echo "127.0.0.1 ${{ inputs.kind-node-hostname }}" | sudo tee -a /etc/hosts
+
+    - name: Set env variables for tests to properly leverage KinD cluster
+      shell: bash
+      run: |
+        echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
+        echo "CLUSTER_HOSTNAME=${{ inputs.kind-node-hostname }}" >> $GITHUB_ENV
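Note: the CLUSTER_TYPE and CLUSTER_HOSTNAME values exported above are the contract consumed by the Go e2e helpers (GetClusterType, GetClusterHostname) that come from project-codeflare/codeflare-common, which is bumped below. Their real implementation is not part of this diff and also takes the test context; the standalone sketch below is only an assumption of how that env-variable contract is typically read:

```go
// Sketch only: assumed stand-ins for the codeflare-common helpers, shown to
// document the env-variable contract established by the composite action.
package main

import (
	"fmt"
	"os"
)

type ClusterType string

const (
	KindCluster      ClusterType = "KIND"
	UndefinedCluster ClusterType = "UNDEFINED"
)

func getClusterType() ClusterType {
	// CLUSTER_TYPE is written by the "Set env variables..." step above.
	if os.Getenv("CLUSTER_TYPE") == string(KindCluster) {
		return KindCluster
	}
	return UndefinedCluster
}

func getClusterHostname() string {
	// CLUSTER_HOSTNAME defaults to the kind-node-hostname input ("kind").
	if hostname, ok := os.LookupEnv("CLUSTER_HOSTNAME"); ok && hostname != "" {
		return hostname
	}
	return "kind"
}

func main() {
	fmt.Printf("cluster type: %s, hostname: %s\n", getClusterType(), getClusterHostname())
}
```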
diff --git a/go.mod b/go.mod
index 932d7614..98011af7 100644
--- a/go.mod
+++ b/go.mod
@@ -5,7 +5,7 @@ go 1.19
 require (
 	github.com/onsi/gomega v1.27.10
 	github.com/openshift/api v0.0.0-20230213134911-7ba313770556
-	github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16
+	github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb
 	github.com/project-codeflare/instascale v0.3.0
 	github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0
 	github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1
diff --git a/go.sum b/go.sum
index c5ad83a7..42334303 100644
--- a/go.sum
+++ b/go.sum
@@ -391,8 +391,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16 h1:TRMLDP6IYt0CAd3+BkvY/r2lkpjI3sOsxf3tnQojZ9k=
-github.com/project-codeflare/codeflare-common v0.0.0-20231023092720-93d03492db16/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
+github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb h1:L2Gdr2SlvshDKZY2KK6507AwzQ1NSfRbMQuz5dOsYNM=
+github.com/project-codeflare/codeflare-common v0.0.0-20231110155354-042fb171fcdb/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk=
 github.com/project-codeflare/instascale v0.3.0 h1:PSlwbqqUsFTkTQ5KUhMFRebfokySnEZwav97xZixLQs=
 github.com/project-codeflare/instascale v0.3.0/go.mod h1:IU1Wl+zqTpMpZ49BOcr6U+A6gF3AjcmFdKo9ZwP3TDI=
 github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.0 h1:dU2Ev0SijdNm30Y9mjdKJL1Fp6l07rnRBKhSbx1kX9g=
diff --git a/test/e2e/mnist_raycluster_sdk.py b/test/e2e/mnist_raycluster_sdk.py
index b830a004..cb3c0af5 100644
--- a/test/e2e/mnist_raycluster_sdk.py
+++ b/test/e2e/mnist_raycluster_sdk.py
@@ -1,4 +1,5 @@
 import sys
+import os
 
 from time import sleep
 
@@ -8,17 +9,38 @@ from codeflare_sdk.job.jobs import DDPJobDefinition
 
 namespace = sys.argv[1]
+ray_image = os.getenv('RAY_IMAGE')
+host = os.getenv('CLUSTER_HOSTNAME')
+
+ingress_options = {}
+if host is not None:
+    ingress_options = {
+        "ingresses": [
+            {
+                "ingressName": "ray-dashboard",
+                "port": 8265,
+                "pathType": "Prefix",
+                "path": "/",
+                "host": host,
+            },
+        ]
+    }
+
 cluster = Cluster(ClusterConfiguration(
     name='mnist',
     namespace=namespace,
     num_workers=1,
+    head_cpus='500m',
+    head_memory=2,
     min_cpus='500m',
     max_cpus=1,
     min_memory=0.5,
-    max_memory=1,
+    max_memory=2,
     num_gpus=0,
     instascale=False,
+    image=ray_image,
+    ingress_options=ingress_options,
 ))
 
 cluster.up()
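The ingress_options block above is only populated when CLUSTER_HOSTNAME is set, i.e. on KinD; on OpenShift the SDK presumably keeps using Routes, which is why the test below still grants route.openshift.io RBAC. As a rough illustration of what these options are expected to translate into on the cluster, the snippet below builds an equivalent networking.k8s.io/v1 Ingress object; the backend service name is an assumption (KubeRay conventionally names the head service <cluster>-head-svc), and this is not SDK code:

```go
// Illustration only: an Ingress roughly equivalent to the ingress_options
// dict above. The SDK/KubeRay generate the real object.
package sketch

import (
	networkingv1 "k8s.io/api/networking/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func rayDashboardIngress(namespace, host string) *networkingv1.Ingress {
	pathType := networkingv1.PathTypePrefix
	return &networkingv1.Ingress{
		ObjectMeta: metav1.ObjectMeta{Name: "ray-dashboard", Namespace: namespace},
		Spec: networkingv1.IngressSpec{
			Rules: []networkingv1.IngressRule{{
				Host: host, // e.g. "kind", matching CLUSTER_HOSTNAME
				IngressRuleValue: networkingv1.IngressRuleValue{
					HTTP: &networkingv1.HTTPIngressRuleValue{
						Paths: []networkingv1.HTTPIngressPath{{
							Path:     "/",
							PathType: &pathType,
							Backend: networkingv1.IngressBackend{
								Service: &networkingv1.IngressServiceBackend{
									Name: "mnist-head-svc", // assumed head service name
									Port: networkingv1.ServiceBackendPort{Number: 8265},
								},
							},
						}},
					},
				},
			}},
		},
	}
}
```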
diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go
index aeede52e..015cbfc5 100644
--- a/test/e2e/mnist_raycluster_sdk_test.go
+++ b/test/e2e/mnist_raycluster_sdk_test.go
@@ -40,104 +40,59 @@ func TestMNISTRayClusterSDK(t *testing.T) {
 	test := With(t)
 	test.T().Parallel()
 
-	// Currently blocked by https://github.com/project-codeflare/codeflare-sdk/pull/251 , remove the skip once SDK with the PR is released
-	test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/251")
-
 	// Create a namespace
 	namespace := test.NewTestNamespace()
 
 	// Test configuration
-	config := &corev1.ConfigMap{
-		TypeMeta: metav1.TypeMeta{
-			APIVersion: corev1.SchemeGroupVersion.String(),
-			Kind:       "ConfigMap",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "mnist-raycluster-sdk",
-			Namespace: namespace.Name,
+	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
+		// SDK script
+		"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
+		// pip requirements
+		"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
+		// MNIST training script
+		"mnist.py": ReadFile(test, "mnist.py"),
+	})
+
+	// Create RBAC, retrieve token for user with limited rights
+	policyRules := []rbacv1.PolicyRule{
+		{
+			Verbs:     []string{"get", "create", "delete", "list", "patch", "update"},
+			APIGroups: []string{mcadv1beta1.GroupName},
+			Resources: []string{"appwrappers"},
 		},
-		BinaryData: map[string][]byte{
-			// SDK script
-			"mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"),
-			// pip requirements
-			"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
-			// MNIST training script
-			"mnist.py": ReadFile(test, "mnist.py"),
+		{
+			Verbs:     []string{"get", "list"},
+			APIGroups: []string{rayv1.GroupVersion.Group},
+			Resources: []string{"rayclusters", "rayclusters/status"},
 		},
-		Immutable: Ptr(true),
-	}
-	config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
-	test.Expect(err).NotTo(HaveOccurred())
-	test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)
-
-	// SDK client RBAC
-	serviceAccount := &corev1.ServiceAccount{
-		TypeMeta: metav1.TypeMeta{
-			APIVersion: corev1.SchemeGroupVersion.String(),
-			Kind:       "ServiceAccount",
+		{
+			Verbs:     []string{"get", "list"},
+			APIGroups: []string{"route.openshift.io"},
+			Resources: []string{"routes"},
 		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "sdk-user",
-			Namespace: namespace.Name,
+		{
+			Verbs:     []string{"get", "list"},
+			APIGroups: []string{"networking.k8s.io"},
+			Resources: []string{"ingresses"},
 		},
 	}
-	serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{})
-	test.Expect(err).NotTo(HaveOccurred())
-	role := &rbacv1.Role{
-		TypeMeta: metav1.TypeMeta{
-			APIVersion: rbacv1.SchemeGroupVersion.String(),
-			Kind:       "Role",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "sdk",
-			Namespace: namespace.Name,
-		},
-		Rules: []rbacv1.PolicyRule{
-			{
-				Verbs:     []string{"get", "create", "delete", "list", "patch", "update"},
-				APIGroups: []string{mcadv1beta1.GroupName},
-				Resources: []string{"appwrappers"},
-			},
-			{
-				Verbs:     []string{"get", "list"},
-				APIGroups: []string{rayv1.GroupVersion.Group},
-				Resources: []string{"rayclusters", "rayclusters/status"},
-			},
-			{
-				Verbs:     []string{"get", "list"},
-				APIGroups: []string{"route.openshift.io"},
-				Resources: []string{"routes"},
-			},
+	// Create cluster wide RBAC, required for SDK OpenShift check
+	// TODO reevaluate once SDK changes OpenShift detection logic
+	clusterPolicyRules := []rbacv1.PolicyRule{
+		{
+			Verbs:         []string{"get", "list"},
+			APIGroups:     []string{"config.openshift.io"},
+			Resources:     []string{"ingresses"},
+			ResourceNames: []string{"cluster"},
 		},
 	}
-	role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{})
-	test.Expect(err).NotTo(HaveOccurred())
-	roleBinding := &rbacv1.RoleBinding{
-		TypeMeta: metav1.TypeMeta{
-			APIVersion: rbacv1.SchemeGroupVersion.String(),
-			Kind:       "RoleBinding",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "sdk",
-		},
-		RoleRef: rbacv1.RoleRef{
-			APIGroup: rbacv1.SchemeGroupVersion.Group,
-			Kind:     "Role",
-			Name:     role.Name,
-		},
-		Subjects: []rbacv1.Subject{
-			{
-				Kind:      "ServiceAccount",
-				APIGroup:  corev1.SchemeGroupVersion.Group,
-				Name:      serviceAccount.Name,
-				Namespace: serviceAccount.Namespace,
-			},
-		},
-	}
-	_, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{})
-	test.Expect(err).NotTo(HaveOccurred())
+	sa := CreateServiceAccount(test, namespace.Name)
+	role := CreateRole(test, namespace.Name, policyRules)
+	CreateRoleBinding(test, namespace.Name, sa, role)
+	clusterRole := CreateClusterRole(test, clusterPolicyRules)
+	CreateClusterRoleBinding(test, sa, clusterRole)
 
 	job := &batchv1.Job{
 		TypeMeta: metav1.TypeMeta{
@@ -161,7 +116,8 @@ func TestMNISTRayClusterSDK(t *testing.T) {
 						// See https://github.com/project-codeflare/codeflare-sdk/pull/146
 						Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e",
 						Env: []corev1.EnvVar{
-							corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"},
+							{Name: "PYTHONUSERBASE", Value: "/workdir"},
+							{Name: "RAY_IMAGE", Value: GetRayImage()},
 						},
 						Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name},
 						VolumeMounts: []corev1.VolumeMount{
@@ -206,12 +162,31 @@ func TestMNISTRayClusterSDK(t *testing.T) {
 					},
 				},
 				RestartPolicy:      corev1.RestartPolicyNever,
-				ServiceAccountName: serviceAccount.Name,
+				ServiceAccountName: sa.Name,
 			},
 		},
 	}
-	job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
+	if GetClusterType(test) == KindCluster {
+		// Take first KinD node and redirect pod hostname requests there
+		node := GetNodes(test)[0]
+		hostname := GetClusterHostname(test)
+		IP := GetNodeInternalIP(test, node)
+
+		test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP)
+		job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{
+			{
+				IP:        IP,
+				Hostnames: []string{hostname},
+			},
+		}
+
+		// Propagate hostname into Python code as env variable
+		hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname}
+		job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar)
+	}
+
+	job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{})
 	test.Expect(err).NotTo(HaveOccurred())
 	test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name)
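The HostAliases plus CLUSTER_HOSTNAME wiring above exists so the SDK pod resolves the ingress host to the KinD node's internal IP, while the 127.0.0.1 /etc/hosts entry from the composite action does the same on the CI runner. Assuming the NGINX ingress controller is exposed on port 80 of that node (as the kind deploy manifest used by the action sets up), a quick reachability check against the Ray dashboard could look like the hypothetical snippet below; it is not part of the test:

```go
// Hypothetical reachability check: resolves the Ray dashboard through the
// shared cluster hostname, relying on /etc/hosts (CI runner) or HostAliases
// (SDK pod) to point that name at the KinD node.
package main

import (
	"fmt"
	"net/http"
	"os"
	"time"
)

func main() {
	hostname := os.Getenv("CLUSTER_HOSTNAME")
	if hostname == "" {
		hostname = "kind" // default from the composite action input
	}

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(fmt.Sprintf("http://%s/", hostname))
	if err != nil {
		fmt.Fprintln(os.Stderr, "dashboard ingress not reachable:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	fmt.Println("dashboard ingress responded:", resp.Status)
}
```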
diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go
index 725ced9d..b8d3f4d0 100644
--- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go
+++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go
@@ -108,7 +108,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
 								},
 								Limits: corev1.ResourceList{
 									corev1.ResourceCPU:    resource.MustParse("1"),
-									corev1.ResourceMemory: resource.MustParse("1G"),
+									corev1.ResourceMemory: resource.MustParse("2G"),
 								},
 							},
 							VolumeMounts: []corev1.VolumeMount{
@@ -168,7 +168,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) {
 							},
 							Limits: corev1.ResourceList{
 								corev1.ResourceCPU:    resource.MustParse("1"),
-								corev1.ResourceMemory: resource.MustParse("1G"),
+								corev1.ResourceMemory: resource.MustParse("2G"),
 							},
 						},
 					},