diff --git a/build/images/tf_operator/Dockerfile b/build/images/tf_operator/Dockerfile deleted file mode 100644 index b83f31d2a7..0000000000 --- a/build/images/tf_operator/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM golang:1.13.5 AS build-image - -ADD . /go/src/github.com/kubeflow/tf-operator - -WORKDIR /go/src/github.com/kubeflow/tf-operator - -RUN if [ "$(uname -m)" = "aarch64" ]; then \ - GO111MODULE="on" CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o tf-operator.v1 ./cmd/tf-operator.v1; \ - else \ - GO111MODULE="on" CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o tf-operator.v1 ./cmd/tf-operator.v1; \ - fi - -FROM gcr.io/distroless/base-debian10 - -COPY third_party/library/license.txt /opt/license.txt - -COPY --from=build-image /go/src/github.com/kubeflow/tf-operator/tf-operator.v1 /opt/ - -ENTRYPOINT ["/opt/tf-operator.v1"] diff --git a/cmd/tf-operator.v1/app/options/options.go b/cmd/tf-operator.v1/app/options/options.go deleted file mode 100644 index 433f94d03d..0000000000 --- a/cmd/tf-operator.v1/app/options/options.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package options - -import ( - "flag" - "time" - - v1 "k8s.io/api/core/v1" -) - -const DefaultResyncPeriod = 12 * time.Hour - -// ServerOption is the main context object for the controller manager. 
-type ServerOption struct { - Kubeconfig string - MasterURL string - Threadiness int - PrintVersion bool - JSONLogFormat bool - EnableGangScheduling bool - GangSchedulerName string - Namespace string - MonitoringPort int - ResyncPeriod time.Duration - // QPS indicates the maximum QPS to the master from this client. - // If it's zero, the created RESTClient will use DefaultQPS: 5 - QPS int - // Maximum burst for throttle. - // If it's zero, the created RESTClient will use DefaultBurst: 10. - Burst int -} - -// NewServerOption creates a new CMServer with a default config. -func NewServerOption() *ServerOption { - s := ServerOption{} - return &s -} - -// AddFlags adds flags for a specific CMServer to the specified FlagSet. -func (s *ServerOption) AddFlags(fs *flag.FlagSet) { - //fs.StringVar(&s.Kubeconfig, "kubeconfig", "", "The path of kubeconfig file") - - fs.StringVar(&s.MasterURL, "master", "", - `The url of the Kubernetes API server, - will overrides any value in kubeconfig, only required if out-of-cluster.`) - - fs.StringVar(&s.Namespace, "namespace", v1.NamespaceAll, - `The namespace to monitor tfjobs. If unset, it monitors all namespaces cluster-wide. - If set, it only monitors tfjobs in the given namespace.`) - - fs.IntVar(&s.Threadiness, "threadiness", 1, - `How many threads to process the main logic`) - - fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") - - fs.BoolVar(&s.JSONLogFormat, "json-log-format", true, - "Set true to use json style log format. Set false to use plaintext style log format") - - fs.BoolVar(&s.EnableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling") - fs.StringVar(&s.GangSchedulerName, "gang-scheduler-name", "volcano", "The scheduler to gang-schedule tfjobs, defaults to volcano") - - fs.IntVar(&s.MonitoringPort, "monitoring-port", 8443, - `Endpoint port for displaying monitoring metrics. 
-It can be set to "0" to disable the metrics serving.`) - - fs.DurationVar(&s.ResyncPeriod, "resyc-period", DefaultResyncPeriod, "Resync interval of the tf-operator") - - fs.IntVar(&s.QPS, "qps", 5, "QPS indicates the maximum QPS to the master from this client.") - fs.IntVar(&s.Burst, "burst", 10, "Maximum burst for throttle.") -} diff --git a/cmd/tf-operator.v1/app/server.go b/cmd/tf-operator.v1/app/server.go deleted file mode 100644 index 2bb6d6ef39..0000000000 --- a/cmd/tf-operator.v1/app/server.go +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package app - -import ( - "context" - "fmt" - "os" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - log "github.com/sirupsen/logrus" - corev1 "k8s.io/api/core/v1" - apiextensionclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/uuid" - kubeinformers "k8s.io/client-go/informers" - kubeclientset "k8s.io/client-go/kubernetes" - restclientset "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - election "k8s.io/client-go/tools/leaderelection" - "k8s.io/client-go/tools/leaderelection/resourcelock" - "k8s.io/client-go/tools/record" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - "github.com/kubeflow/common/pkg/util/signals" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - "github.com/kubeflow/tf-operator/pkg/common" - controller "github.com/kubeflow/tf-operator/pkg/controller.v1/tensorflow" - "github.com/kubeflow/tf-operator/pkg/version" -) - -const ( - apiVersion = "v1" -) - -var ( - // leader election config - leaseDuration = 15 * time.Second - renewDuration = 5 * time.Second - retryPeriod = 3 * time.Second -) - -// RecommendedKubeConfigPathEnv is the environment variable name for kubeconfig. -const RecommendedKubeConfigPathEnv = "KUBECONFIG" - -var ( - isLeader = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "tf_operator_is_leader", - Help: "Is this client the leader of this tf-operator client set?", - }) -) - -// Run runs the server. -func Run(opt *options.ServerOption) error { - // Check if the -version flag was passed and, if so, print the version and exit. 
- if opt.PrintVersion { - version.PrintVersionAndExit(apiVersion) - } - - namespace := os.Getenv(common.EnvKubeflowNamespace) - if len(namespace) == 0 { - log.Infof("EnvKubeflowNamespace not set, use default namespace %s", - metav1.NamespaceDefault) - namespace = metav1.NamespaceDefault - } - if opt.Namespace == corev1.NamespaceAll { - log.Info("Using cluster scoped operator") - } else { - log.Infof("Scoping operator to namespace %s", opt.Namespace) - } - - // To help debugging, immediately log version. - log.Infof("%+v", version.Info(apiVersion)) - - // Set up signals so we handle the first shutdown signal gracefully. - stopCh := signals.SetupSignalHandler() - - // Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option. - if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 { - // use the current context in kubeconfig - // This is very useful for running locally. - opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv) - } - - // Get kubernetes config. - kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig) - if err != nil { - log.Fatalf("Error building kubeconfig: %s", err.Error()) - } - - // Set client qps and burst by opt. - kcfg.QPS = float32(opt.QPS) - kcfg.Burst = opt.Burst - log.Infof( - "Creating client sets and informers with QPS %d, burst %d, resync period %s", - opt.QPS, opt.Burst, opt.ResyncPeriod.String()) - - // Create clients. - kubeClientSet, leaderElectionClientSet, - apiextensionClientSet, tfJobClientSet, - volcanoClientSet, err := createClientSets(kcfg) - if err != nil { - log.Fatalf("Error create client set : %s", err.Error()) - return err - } - if !checkCRDExists(apiextensionClientSet, opt.Namespace) { - return fmt.Errorf("Failed to get the expected TFJobs with API version %s", - tfJobClientSet.KubeflowV1().RESTClient().APIVersion()) - } - // Create informer factory. 
- kubeInformerFactory := kubeinformers.NewFilteredSharedInformerFactory(kubeClientSet, opt.ResyncPeriod, opt.Namespace, nil) - tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, opt.ResyncPeriod) - - unstructuredInformer := controller.NewUnstructuredTFJobInformer( - kcfg, opt.Namespace, opt.ResyncPeriod) - - // Create tf controller. - tc := controller.NewTFController(unstructuredInformer, kubeClientSet, volcanoClientSet, tfJobClientSet, kubeInformerFactory, tfJobInformerFactory, *opt) - - // Start informer goroutines. - go kubeInformerFactory.Start(stopCh) - - // We do not use the generated informer because of - // https://github.com/kubeflow/tf-operator/issues/561 - // go tfJobInformerFactory.Start(stopCh) - go unstructuredInformer.Informer().Run(stopCh) - - // Set leader election start function. - run := func(context.Context) { - isLeader.Set(1) - if err := tc.Run(opt.Threadiness, stopCh); err != nil { - log.Errorf("Failed to run the controller: %v", err) - } - } - - id, err := os.Hostname() - if err != nil { - return fmt.Errorf("failed to get hostname: %v", err) - } - // add a uniquifier so that two processes on the same host don't accidentally both become active - id = id + "_" + string(uuid.NewUUID()) - - // Prepare event clients. - eventBroadcaster := record.NewBroadcaster() - if err = corev1.AddToScheme(scheme.Scheme); err != nil { - return fmt.Errorf("CoreV1 Add Scheme failed: %v", err) - } - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "tf-operator"}) - - rl := &resourcelock.EndpointsLock{ - EndpointsMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: "tf-operator", - }, - Client: leaderElectionClientSet.CoreV1(), - LockConfig: resourcelock.ResourceLockConfig{ - Identity: id, - EventRecorder: recorder, - }, - } - - // Start leader election. 
- election.RunOrDie(context.TODO(), election.LeaderElectionConfig{ - Lock: rl, - LeaseDuration: leaseDuration, - RenewDeadline: renewDuration, - RetryPeriod: retryPeriod, - Callbacks: election.LeaderCallbacks{ - OnStartedLeading: run, - OnStoppedLeading: func() { - isLeader.Set(0) - log.Fatalf("leader election lost") - }, - }, - }) - - return nil -} - -func createClientSets(config *restclientset.Config) ( - kubeclientset.Interface, kubeclientset.Interface, - apiextensionclientset.Interface, tfjobclientset.Interface, - volcanoclient.Interface, error) { - - kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "tf-operator")) - if err != nil { - return nil, nil, nil, nil, nil, err - } - - leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) - if err != nil { - return nil, nil, nil, nil, nil, err - } - - apiextensionClientSet, err := apiextensionclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) - if err != nil { - return nil, nil, nil, nil, nil, err - } - - tfJobClientSet, err := tfjobclientset.NewForConfig(config) - if err != nil { - return nil, nil, nil, nil, nil, err - } - - volcanoClientSet, err := volcanoclient.NewForConfig(restclientset.AddUserAgent(config, "volcano")) - if err != nil { - return nil, nil, nil, nil, nil, err - } - - return kubeClientSet, leaderElectionClientSet, apiextensionClientSet, tfJobClientSet, volcanoClientSet, nil -} - -// checkCRDExists checks if the CRD exists. -func checkCRDExists(clientset apiextensionclientset.Interface, namespace string) bool { - crd, err := clientset.ApiextensionsV1beta1(). - CustomResourceDefinitions(). 
- Get(context.TODO(), "tfjobs.kubeflow.org", metav1.GetOptions{}) - - if err != nil { - log.Error(err) - if _, ok := err.(*errors.StatusError); ok { - if errors.IsNotFound(err) { - return false - } - } else { - return false - } - } - - log.Infof("CRD %s/%s %s is registered", - crd.Spec.Group, crd.Spec.Version, crd.Spec.Names.Singular) - return true -} diff --git a/cmd/tf-operator.v1/main.go b/cmd/tf-operator.v1/main.go deleted file mode 100644 index 39f154cecc..0000000000 --- a/cmd/tf-operator.v1/main.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "flag" - "fmt" - "net/http" - _ "net/http/pprof" - "strconv" - - "github.com/onrik/logrus/filename" - "github.com/prometheus/client_golang/prometheus/promhttp" - log "github.com/sirupsen/logrus" - - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" -) - -func init() { - // Add filename as one of the fields of the structured log message. 
- filenameHook := filename.NewHook() - filenameHook.Field = "filename" - log.AddHook(filenameHook) -} - -func startMonitoring(monitoringPort int) { - if monitoringPort != 0 { - go func() { - log.Infof("Setting up client for monitoring on port: %s", strconv.Itoa(monitoringPort)) - http.Handle("/metrics", promhttp.Handler()) - err := http.ListenAndServe(fmt.Sprintf(":%s", strconv.Itoa(monitoringPort)), nil) - if err != nil { - log.Error("Monitoring endpoint setup failure.", err) - } - }() - } -} - -func main() { - s := options.NewServerOption() - s.AddFlags(flag.CommandLine) - - flag.Parse() - - if s.JSONLogFormat { - // Output logs in a json format so that it can be parsed by services like Stackdriver. - log.SetFormatter(&log.JSONFormatter{}) - } - - startMonitoring(s.MonitoringPort) - - if err := app.Run(s); err != nil { - log.Fatalf("Failed to run: %v", err) - } -} diff --git a/docs/development/developer_guide.md b/docs/development/developer_guide.md index 2196fa6d34..34eda7cc50 100644 --- a/docs/development/developer_guide.md +++ b/docs/development/developer_guide.md @@ -22,7 +22,7 @@ GO111MODULE="on" go mod vendor Build it ```sh -go install github.com/kubeflow/tf-operator/cmd/tf-operator.v1 +go install github.com/kubeflow/tf-operator/cmd/training-operator.v1 ``` ## Running the Operator Locally @@ -57,7 +57,7 @@ export KUBEFLOW_NAMESPACE=$(your_namespace) After the cluster is up, the TFJob CRD should be created on the cluster. ```bash -kubectl create -f ./examples/crd/crd-v1.yaml +make install ``` ### Run Operator @@ -65,7 +65,7 @@ kubectl create -f ./examples/crd/crd-v1.yaml Now we are ready to run operator locally: ```sh -tf-operator +make run ``` To verify local operator is working, create an example job and you should see jobs created by it. 
diff --git a/pkg/controller.v1/mxnet/mxjob_controller.go b/pkg/controller.v1/mxnet/mxjob_controller.go index 970699f09d..888569575b 100644 --- a/pkg/controller.v1/mxnet/mxjob_controller.go +++ b/pkg/controller.v1/mxnet/mxjob_controller.go @@ -57,7 +57,7 @@ import ( ) const ( - controllerName = "mxnet-operator" + controllerName = "mxjob-controller" // mxJobCreatedReason is added in a mxjob when it is created. mxJobCreatedReason = "MXJobCreated" @@ -73,7 +73,7 @@ const ( var ( jobOwnerKey = ".metadata.controller" - // DefaultMXControllerConfiguration is the suggested mxnet-operator configuration for production. + // DefaultMXControllerConfiguration is the suggested mxjob-controller configuration for production. DefaultMXControllerConfiguration = common.JobControllerConfiguration{ ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second}, EnableGangScheduling: false, diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index a1b8bd6a22..b52abd359c 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -56,7 +56,7 @@ import ( ) const ( - controllerName = "pytorchjob-operator" + controllerName = "pytorchjob-controller" ) var ( diff --git a/pkg/controller.v1/tensorflow/controller.go b/pkg/controller.v1/tensorflow/controller.go deleted file mode 100644 index 4e6285a6a3..0000000000 --- a/pkg/controller.v1/tensorflow/controller.go +++ /dev/null @@ -1,409 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package tensorflow provides a Kubernetes controller for a TFJob resource. -package tensorflow - -import ( - "context" - "fmt" - "time" - - "github.com/kubeflow/tf-operator/pkg/common/util" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - - log "github.com/sirupsen/logrus" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" - kubeinformers "k8s.io/client-go/informers" - kubeclientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/tools/cache" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/common" - tflogger "github.com/kubeflow/common/pkg/util" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - tfjobscheme "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - tfjobinformersv1 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/tensorflow/v1" - tfjoblisters "github.com/kubeflow/tf-operator/pkg/client/listers/tensorflow/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" -) - -const ( - controllerName = "tf-operator" - - // 
labels for pods and servers. - tfReplicaTypeLabel = "replica-type" - tfReplicaIndexLabel = "replica-index" - labelGroupName = "group-name" - // Deprecated label for backwards compatibility. Has to be removed - labelTFJobName = "tf-job-name" - // volcanoTaskSpecKey task spec key used in pod annotation when EnableGangScheduling is true - volcanoTaskSpecKey = "volcano.sh/task-spec" -) - -var ( - // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. - // IndexerInformer uses a delta queue, therefore for deletes we have to use this - // key function but it should be just fine for non delete events. - KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc - - tfJobsDeletedCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "tf_operator_jobs_deleted_total", - Help: "Counts number of TF jobs deleted", - }, - []string{"job_namespace"}, - ) -) - -// TFController is the type for TFJob Controller, which manages -// the lifecycle of TFJobs. -type TFController struct { - common.JobController - - // tfJobClientSet is a clientset for CRD TFJob. - tfJobClientSet tfjobclientset.Interface - - // To allow injection of sync functions for testing. - syncHandler func(string) (bool, error) - - // tfJobInformer is a temporary field for unstructured informer support. - tfJobInformer cache.SharedIndexInformer - - // Listers for TFJob, Pod and Service - // tfJobLister can list/get tfjobs from the shared informer's store. - tfJobLister tfjoblisters.TFJobLister - - // tfJobInformerSynced returns true if the tfjob store has been synced at least once. - tfJobInformerSynced cache.InformerSynced -} - -// NewTFController returns a new TFJob controller. -func NewTFController( - // This variable is for unstructured informer. 
- tfJobInformer tfjobinformersv1.TFJobInformer, - kubeClientSet kubeclientset.Interface, - volcanoClientSet volcanoclient.Interface, - tfJobClientSet tfjobclientset.Interface, - kubeInformerFactory kubeinformers.SharedInformerFactory, - // This field is not used now but we keep it since it will be used - // after we support CRD validation. - tfJobInformerFactory tfjobinformers.SharedInformerFactory, - option options.ServerOption) *TFController { - - err := tfjobscheme.AddToScheme(scheme.Scheme) - if err != nil { - log.Fatalf("Failed to add tfjob scheme: %v", err) - } - - log.Info("Creating TFJob controller") - // Create new TFController. - tc := &TFController{ - tfJobClientSet: tfJobClientSet, - } - - // Create base controller - log.Info("Creating Job controller") - - jc := common.NewJobController(tc, metav1.Duration{Duration: 15 * time.Second}, - option.EnableGangScheduling, kubeClientSet, volcanoClientSet, kubeInformerFactory, tfv1.Plural) - - // Set sync handler. - tc.syncHandler = tc.syncTFJob - - // TODO(ChanYiLin): these are originally for testing, but with using common library, - // we can not replcae the function. Also need to update or remove some tests - - // tc.updateStatusHandler = tc.UpdateJobStatusInApiServer - // set delete handler. - // tc.deleteTFJobHandler = tc.DeleteJob - - // Set up an event handler for when tfjob resources change. - tfJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: tc.addTFJob, - UpdateFunc: tc.updateTFJob, - // This will enter the sync loop and no-op, - // because the tfjob has been deleted from the store. - DeleteFunc: tc.enqueueTFJob, - }) - - tc.tfJobInformer = tfJobInformer.Informer() - tc.tfJobLister = tfJobInformer.Lister() - tc.tfJobInformerSynced = tfJobInformer.Informer().HasSynced - - // Create pod informer. 
- podInformer := kubeInformerFactory.Core().V1().Pods() - - // Set up an event handler for when pod resources change - podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: jc.AddPod, - UpdateFunc: jc.UpdatePod, - DeleteFunc: jc.DeletePod, - }) - - // tc.PodLister = podInformer.Lister() - // tc.PodInformerSynced = podInformer.Informer().HasSynced - jc.PodLister = podInformer.Lister() - jc.PodInformerSynced = podInformer.Informer().HasSynced - - // Create service informer. - serviceInformer := kubeInformerFactory.Core().V1().Services() - - // Set up an event handler for when service resources change. - serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: jc.AddService, - UpdateFunc: jc.UpdateService, - DeleteFunc: jc.DeleteService, - }) - - // tc.ServiceLister = serviceInformer.Lister() - // tc.ServiceInformerSynced = serviceInformer.Informer().HasSynced - jc.ServiceLister = serviceInformer.Lister() - jc.ServiceInformerSynced = serviceInformer.Informer().HasSynced - - tc.JobController = jc - - return tc -} - -// Run will set up the event handlers for types we are interested in, as well -// as syncing informer caches and starting workers. It will block until stopCh -// is closed, at which point it will shutdown the workqueue and wait for -// workers to finish processing their current work items. -func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error { - defer utilruntime.HandleCrash() - defer tc.WorkQueue.ShutDown() - - // Start the informer factories to begin populating the informer caches. - log.Info("Starting TFJob controller") - - // Wait for the caches to be synced before starting workers. 
- log.Info("Waiting for informer caches to sync") - - if ok := cache.WaitForCacheSync(stopCh, tc.tfJobInformerSynced, - tc.PodInformerSynced, tc.ServiceInformerSynced); !ok { - return fmt.Errorf("failed to wait for caches to sync") - } - log.Infof("Starting %v workers", threadiness) - // Launch workers to process TFJob resources. - for i := 0; i < threadiness; i++ { - go wait.Until(tc.runWorker, time.Second, stopCh) - } - - log.Info("Started workers") - <-stopCh - log.Info("Shutting down workers") - - return nil -} - -// runWorker is a long-running function that will continually call the -// processNextWorkItem function in order to read and process a message on the -// workqueue. -func (tc *TFController) runWorker() { - for tc.processNextWorkItem() { - } -} - -// processNextWorkItem will read a single work item off the workqueue and -// attempt to process it, by calling the syncHandler. -func (tc *TFController) processNextWorkItem() bool { - obj, quit := tc.WorkQueue.Get() - if quit { - return false - } - defer tc.WorkQueue.Done(obj) - - var key string - var ok bool - if key, ok = obj.(string); !ok { - // As the item in the workqueue is actually invalid, we call - // Forget here else we'd go into a loop of attempting to - // process a work item that is invalid. - tc.WorkQueue.Forget(obj) - utilruntime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj)) - return true - } - logger := tflogger.LoggerForKey(key) - - tfJob, err := tc.getTFJobFromKey(key) - if err != nil { - if err == errNotExists { - logger.Infof("TFJob has been deleted: %v", key) - namespace, _, keyerr := cache.SplitMetaNamespaceKey(key) - if keyerr == nil && len(namespace) != 0 { - tfJobsDeletedCount.WithLabelValues(namespace).Inc() - } else { - logger.Errorf("Invalid TFJob key %s: Namespace is missing %v", key, keyerr) - } - return true - } - - // Log the failure to conditions. 
- logger.Errorf("Failed to get TFJob from key %s: %v", key, err) - if err == errFailedMarshal { - errMsg := fmt.Sprintf("Failed to unmarshal the object to TFJob object: %v", err) - tflogger.LoggerForJob(tfJob).Warn(errMsg) - tc.Recorder.Event(tfJob, v1.EventTypeWarning, failedMarshalTFJobReason, errMsg) - } - - return true - } - - // Sync TFJob to match the actual state to this desired state. - forget, err := tc.syncHandler(key) - if err == nil { - if forget { - tc.WorkQueue.Forget(key) - } - return true - } - - utilruntime.HandleError(fmt.Errorf("error syncing tfjob: %v", err)) - tc.WorkQueue.AddRateLimited(key) - - return true -} - -func (tc *TFController) enqueueTFJob(tfjob interface{}) { - key, err := KeyFunc(tfjob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfjob, err)) - return - } - - // TODO: we may need add backoff here - tc.WorkQueue.Add(key) -} - -// syncTFJob syncs the tfjob with the given key if it has had its expectations fulfilled, meaning -// it did not expect to see any more of its pods/services created or deleted. -// This function is not meant to be invoked concurrently with the same key. 
-func (tc *TFController) syncTFJob(key string) (bool, error) { - startTime := time.Now() - logger := tflogger.LoggerForKey(key) - defer func() { - logger.Infof("Finished syncing tfjob %q (%v)", key, time.Since(startTime)) - }() - - namespace, name, err := cache.SplitMetaNamespaceKey(key) - if err != nil { - return false, err - } - if len(namespace) == 0 || len(name) == 0 { - return false, fmt.Errorf("invalid tfjob key %q: either namespace or name is missing", key) - } - - sharedTFJob, err := tc.getTFJobFromName(namespace, name) - if err != nil { - if err == errNotExists { - logger.Infof("TFJob has been deleted: %v", key) - tfJobsDeletedCount.WithLabelValues(namespace).Inc() - return true, nil - } - return false, err - } - - tfjob := sharedTFJob.DeepCopy() - - // Sync tfjob every time if EnableDynamicWorker is true - jobKey, err := common.KeyFunc(tfjob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", tfjob, err)) - } - - replicaTypes := util.GetReplicaTypes(tfjob.Spec.TFReplicaSpecs) - tfjobNeedsSync := tfjob.Spec.EnableDynamicWorker || util.SatisfiedExpectations(tc.Expectations, jobKey, replicaTypes) - - // Set default for the new tfjob. 
- scheme.Scheme.Default(tfjob) - - var reconcileTFJobsErr error - if tfjobNeedsSync && tfjob.DeletionTimestamp == nil { - reconcileTFJobsErr = tc.ReconcileJobs(tfjob, tfjob.Spec.TFReplicaSpecs, tfjob.Status, &tfjob.Spec.RunPolicy) - } - - if reconcileTFJobsErr != nil { - return false, reconcileTFJobsErr - } - - return true, err -} - -func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) { - return tc.getTFJobFromName(namespace, name) -} - -func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) { - return tc.tfJobClientSet.KubeflowV1().TFJobs(namespace).Get(context.TODO(), name, metav1.GetOptions{}) -} - -func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind { - return tfv1.GroupVersion.WithKind(tfv1.Kind) -} - -func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion { - return tfv1.GroupVersion -} - -func (tc *TFController) GetGroupNameLabelKey() string { - return labelGroupName -} - -// Deprecated function for backwards compatibility. 
Has to be removed later -func (tc *TFController) GetJobNameLabelKey() string { - return labelTFJobName -} - -func (tc *TFController) GetGroupNameLabelValue() string { - return tfv1.GroupVersion.Group -} - -func (tc *TFController) GetReplicaTypeLabelKey() string { - return tfReplicaTypeLabel -} - -func (tc *TFController) GetReplicaIndexLabelKey() string { - return tfReplicaIndexLabel -} - -func (tc *TFController) ControllerName() string { - return controllerName -} - -func (tc *TFController) GetDefaultContainerName() string { - return tfv1.DefaultContainerName -} - -func (tc *TFController) GetDefaultContainerPortName() string { - return tfv1.DefaultPortName -} - -func (tc *TFController) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool { - - if ContainChieforMasterSpec(replicas) { - return rtype == tfv1.TFReplicaTypeChief || rtype == tfv1.TFReplicaTypeMaster - } - // else check if it is worker with index 0 - return rtype == tfv1.TFReplicaTypeWorker && index == 0 -} diff --git a/pkg/controller.v1/tensorflow/controller_test.go b/pkg/controller.v1/tensorflow/controller_test.go deleted file mode 100644 index 45763fb751..0000000000 --- a/pkg/controller.v1/tensorflow/controller_test.go +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TFJob resource. 
-package tensorflow - -import ( - "testing" - "time" - - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - kubeinformers "k8s.io/client-go/informers" - kubeclientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - batchv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/control" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/testutil" -) - -var ( - tfJobRunning = commonv1.JobRunning - tfJobSucceeded = commonv1.JobSucceeded -) - -func newTFController( - config *rest.Config, - kubeClientSet kubeclientset.Interface, - volcanoClientSet volcanoclient.Interface, - tfJobClientSet tfjobclientset.Interface, - duration time.Duration, - option options.ServerOption, -) ( - *TFController, - kubeinformers.SharedInformerFactory, tfjobinformers.SharedInformerFactory, -) { - kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, duration) - tfJobInformerFactory := tfjobinformers.NewSharedInformerFactory(tfJobClientSet, duration) - - tfJobInformer := NewUnstructuredTFJobInformer(config, metav1.NamespaceAll, time.Hour*12) - - ctr := NewTFController(tfJobInformer, kubeClientSet, - volcanoClientSet, tfJobClientSet, kubeInformerFactory, - tfJobInformerFactory, option) - ctr.PodControl = &control.FakePodControl{} - ctr.ServiceControl = &control.FakeServiceControl{} - return ctr, kubeInformerFactory, tfJobInformerFactory -} - -func TestNormalPath(t *testing.T) { - testCases := map[string]struct { - worker int - ps int - - // pod setup - // ControllerError 
error - // jobKeyForget bool - - pendingWorkerPods int32 - activeWorkerPods int32 - succeededWorkerPods int32 - failedWorkerPods int32 - - pendingPSPods int32 - activePSPods int32 - succeededPSPods int32 - failedPSPods int32 - - activeWorkerServices int32 - activePSServices int32 - - // expectations - expectedPodCreations int32 - expectedPodDeletions int32 - expectedServiceCreations int32 - - expectedActiveWorkerPods int32 - expectedSucceededWorkerPods int32 - expectedFailedWorkerPods int32 - - expectedActivePSPods int32 - expectedSucceededPSPods int32 - expectedFailedPSPods int32 - - expectedCondition *commonv1.JobConditionType - expectedConditionReason string - - // There are some cases that should not check start time since the field should be set in the previous sync loop. - needCheckStartTime bool - }{ - "Local TFJob is created": { - 1, 0, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, - 1, 0, 1, - 0, 0, 0, - 0, 0, 0, - // We can not check if it is created since the condition is set in addTFJob. 
- nil, "", - false, - }, - "Distributed TFJob (4 workers, 2 PS) is created": { - 4, 2, - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, - 6, 0, 6, - 0, 0, 0, - 0, 0, 0, - nil, "", - false, - }, - "Distributed TFJob (4 workers, 2 PS) is created and all replicas are pending": { - 4, 2, - 4, 0, 0, 0, - 2, 0, 0, 0, - 4, 2, - 0, 0, 0, - 0, 0, 0, - 0, 0, 0, - nil, "", - false, - }, - "Distributed TFJob (4 workers, 2 PS) is created and all replicas are running": { - 4, 2, - 0, 4, 0, 0, - 0, 2, 0, 0, - 4, 2, - 0, 0, 0, - 4, 0, 0, - 2, 0, 0, - &tfJobRunning, tfJobRunningReason, - true, - }, - "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending": { - 4, 2, - 2, 0, 0, 0, - 1, 0, 0, 0, - 2, 1, - 3, 0, 3, - 0, 0, 0, - 0, 0, 0, - nil, "", - false, - }, - "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is running": { - 4, 2, - 2, 1, 0, 0, - 1, 0, 0, 0, - 3, 1, - 2, 0, 2, - 1, 0, 0, - 0, 0, 0, - &tfJobRunning, tfJobRunningReason, - false, - }, - "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is succeeded": { - 4, 2, - 2, 0, 1, 0, - 1, 0, 0, 0, - 3, 1, - 2, 0, 2, - 0, 1, 0, - 0, 0, 0, - nil, "", - false, - }, - "Distributed TFJob (4 workers, 2 PS) is succeeded": { - 4, 2, - 0, 0, 4, 0, - 0, 0, 2, 0, - 4, 2, - 0, 0, 0, - 0, 4, 0, - 0, 2, 0, - &tfJobSucceeded, tfJobSucceededReason, - false, - }, - } - - for name, tc := range testCases { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - option := options.ServerOption{} - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, volcanoClientSet, tfJobClientSet, 0, option) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - // Run the test logic. - tfJob := testutil.NewTFJob(tc.worker, tc.ps) - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - testutil.SetPodsStatuses(podIndexer, tfJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, t) - testutil.SetPodsStatuses(podIndexer, tfJob, testutil.LabelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, t) - - serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() - testutil.SetServices(serviceIndexer, tfJob, testutil.LabelWorker, tc.activeWorkerServices, t) - testutil.SetServices(serviceIndexer, tfJob, testutil.LabelPS, tc.activePSServices, t) - - //_, err = ctr.syncTFJob(testutil.GetKey(tfJob, t)) - _ = ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) - - fakePodControl := ctr.PodControl.(*control.FakePodControl) - fakeServiceControl := 
ctr.ServiceControl.(*control.FakeServiceControl) - if int32(len(fakePodControl.Templates)) != tc.expectedPodCreations { - t.Errorf("%s: unexpected number of pod creates. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.Templates)) - } - if int32(len(fakeServiceControl.Templates)) != tc.expectedServiceCreations { - t.Errorf("%s: unexpected number of service creates. Expected %d, saw %d\n", name, tc.expectedServiceCreations, len(fakeServiceControl.Templates)) - } - if int32(len(fakePodControl.DeletePodName)) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", name, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) - } - // Each create should have an accompanying ControllerRef. - if len(fakePodControl.ControllerRefs) != int(tc.expectedPodCreations) { - t.Errorf("%s: unexpected number of ControllerRefs. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.ControllerRefs)) - } - // Make sure the ControllerRefs are correct. - for _, controllerRef := range fakePodControl.ControllerRefs { - if got, want := controllerRef.APIVersion, tfv1.GroupVersion.String(); got != want { - t.Errorf("controllerRef.APIVersion = %q, want %q", got, want) - } - if got, want := controllerRef.Kind, tfv1.Kind; got != want { - t.Errorf("controllerRef.Kind = %q, want %q", got, want) - } - if got, want := controllerRef.Name, tfJob.Name; got != want { - t.Errorf("controllerRef.Name = %q, want %q", got, want) - } - if got, want := controllerRef.UID, tfJob.UID; got != want { - t.Errorf("controllerRef.UID = %q, want %q", got, want) - } - if controllerRef.Controller == nil || !*controllerRef.Controller { - t.Errorf("controllerRef.Controller is not set to true") - } - } - // Validate worker status. 
- if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)] != nil { - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Active != tc.expectedActiveWorkerPods { - t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", - name, tc.expectedActiveWorkerPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Active) - } - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Succeeded != tc.expectedSucceededWorkerPods { - t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", - name, tc.expectedSucceededWorkerPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Succeeded) - } - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Failed != tc.expectedFailedWorkerPods { - t.Errorf("%s: unexpected number of failed pods. Expected %d, saw %d\n", - name, tc.expectedFailedWorkerPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Failed) - } - } - // Validate PS status. - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)] != nil { - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Active != tc.expectedActivePSPods { - t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", - name, tc.expectedActivePSPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Active) - } - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Succeeded != tc.expectedSucceededPSPods { - t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", - name, tc.expectedSucceededPSPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Succeeded) - } - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Failed != tc.expectedFailedPSPods { - t.Errorf("%s: unexpected number of failed pods. 
Expected %d, saw %d\n", - name, tc.expectedFailedPSPods, - tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypePS)].Failed) - } - } - // Validate StartTime. - if tc.needCheckStartTime && tfJob.Status.StartTime == nil { - t.Errorf("%s: StartTime was not set", name) - } - // Validate conditions. - if tc.expectedCondition != nil && !testutil.CheckCondition(tfJob, *tc.expectedCondition, tc.expectedConditionReason) { - t.Errorf("%s: expected condition %#v, got %#v", name, *tc.expectedCondition, tfJob.Status.Conditions) - } - } -} - -func TestRun(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - - stopCh := make(chan struct{}) - go func() { - // It is a hack to let the controller stop to run without errors. - // We can not just send a struct to stopCh because there are multiple - // receivers in controller.Run. 
- time.Sleep(testutil.SleepInterval) - stopCh <- struct{}{} - }() - err := ctr.Run(testutil.ThreadCount, stopCh) - if err != nil { - t.Errorf("Failed to run: %v", err) - } -} diff --git a/pkg/controller.v1/tensorflow/informer.go b/pkg/controller.v1/tensorflow/informer.go deleted file mode 100644 index 03e8207709..0000000000 --- a/pkg/controller.v1/tensorflow/informer.go +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2021 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License - -package tensorflow - -import ( - "fmt" - "time" - - log "github.com/sirupsen/logrus" - metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" - restclientset "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - - tflogger "github.com/kubeflow/common/pkg/util" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/validation" - tfjobinformers "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions" - tfjobinformersv1 "github.com/kubeflow/tf-operator/pkg/client/informers/externalversions/tensorflow/v1" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/unstructured" -) - -const ( - failedMarshalMsg = "Failed to marshal the object to TFJob: %v" -) - -var ( - errGetFromKey = fmt.Errorf("failed to get TFJob from key") - errNotExists = fmt.Errorf("the object is not 
found") - errFailedMarshal = fmt.Errorf("failed to marshal the object to TFJob") -) - -func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string, resyncPeriod time.Duration) tfjobinformersv1.TFJobInformer { - dclient, err := dynamic.NewForConfig(restConfig) - if err != nil { - panic(err) - } - - resource := schema.GroupVersionResource{ - Group: tfv1.GroupVersion.Group, - Version: tfv1.GroupVersion.Version, - Resource: "tfjobs", - } - - informer := unstructured.NewTFJobInformer( - resource, - dclient, - namespace, - resyncPeriod, - cache.Indexers{}, - ) - return informer -} - -// NewTFJobInformer returns TFJobInformer from the given factory. -func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer { - return tfJobInformerFactory.Kubeflow().V1().TFJobs() -} - -func (tc *TFController) getTFJobFromName(namespace, name string) (*tfv1.TFJob, error) { - key := fmt.Sprintf("%s/%s", namespace, name) - return tc.getTFJobFromKey(key) -} - -func (tc *TFController) getTFJobFromKey(key string) (*tfv1.TFJob, error) { - // Check if the key exists. - obj, exists, err := tc.tfJobInformer.GetIndexer().GetByKey(key) - logger := tflogger.LoggerForKey(key) - if err != nil { - logger.Errorf("Failed to get TFJob '%s' from informer index: %+v", key, err) - return nil, errGetFromKey - } - if !exists { - // This happens after a tfjob was deleted, but the work queue still had an entry for it. - return nil, errNotExists - } - - return tfJobFromUnstructured(obj) -} - -func tfJobFromUnstructured(obj interface{}) (*tfv1.TFJob, error) { - // Check if the spec is valid. 
- un, ok := obj.(*metav1unstructured.Unstructured) - if !ok { - log.Errorf("The object in index is not an unstructured; %+v", obj) - return nil, errGetFromKey - } - var tfjob tfv1.TFJob - err := runtime.DefaultUnstructuredConverter.FromUnstructured(un.Object, &tfjob) - logger := tflogger.LoggerForUnstructured(un, tfv1.Kind) - if err != nil { - logger.Errorf(failedMarshalMsg, err) - return nil, errFailedMarshal - } - // This is a simple validation for TFJob to close - // https://github.com/kubeflow/tf-operator/issues/641 - // TODO(gaocegege): Add more validation here. - err = validation.ValidateV1TFJobSpec(&tfjob.Spec) - if err != nil { - logger.Errorf(failedMarshalMsg, err) - return nil, errFailedMarshal - } - return &tfjob, nil -} - -func unstructuredFromTFJob(obj interface{}, tfJob *tfv1.TFJob) error { - un, ok := obj.(*metav1unstructured.Unstructured) - logger := tflogger.LoggerForJob(tfJob) - if !ok { - logger.Warn("The object in index isn't type Unstructured") - return errGetFromKey - } - - var err error - un.Object, err = runtime.DefaultUnstructuredConverter.ToUnstructured(tfJob) - if err != nil { - logger.Error("The TFJob convert failed") - return err - } - return nil - -} diff --git a/pkg/controller.v1/tensorflow/job.go b/pkg/controller.v1/tensorflow/job.go deleted file mode 100644 index f1bbee0e0f..0000000000 --- a/pkg/controller.v1/tensorflow/job.go +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2021 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License - -package tensorflow - -import ( - "context" - "fmt" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - log "github.com/sirupsen/logrus" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/client-go/kubernetes/scheme" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - commonutil "github.com/kubeflow/common/pkg/util" - "github.com/kubeflow/common/pkg/util/k8sutil" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - "k8s.io/apimachinery/pkg/runtime" -) - -const ( - failedMarshalTFJobReason = "InvalidTFJobSpec" - FailedDeleteJobReason = "FailedDeleteJob" - SuccessfulDeleteJobReason = "SuccessfulDeleteJob" -) - -var ( - tfJobsCreatedCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "tf_operator_jobs_created_total", - Help: "Counts number of TF jobs created", - }, - []string{"job_namespace"}, - ) -) - -// DeleteJob implements ControllerInterface interface. 
-func (tc *TFController) DeleteJob(job interface{}) error { - tfJob, ok := job.(*tfv1.TFJob) - if !ok { - return fmt.Errorf("%v is not a type of TFJob", tfJob) - } - - log := commonutil.LoggerForJob(tfJob) - if err := tc.tfJobClientSet.KubeflowV1().TFJobs(tfJob.Namespace).Delete(context.TODO(), tfJob.Name, metav1.DeleteOptions{}); err != nil { - tc.JobController.Recorder.Eventf(tfJob, v1.EventTypeWarning, FailedDeleteJobReason, "Error deleting: %v", err) - log.Errorf("failed to delete job %s/%s, %v", tfJob.Namespace, tfJob.Name, err) - return err - } - - tc.JobController.Recorder.Eventf(tfJob, v1.EventTypeNormal, SuccessfulDeleteJobReason, "Deleted job: %v", tfJob.Name) - log.Infof("job %s/%s has been deleted", tfJob.Namespace, tfJob.Name) - return nil -} - -// addTFJob sets the defaults and enqueue the current tfjob. -func (tc *TFController) addTFJob(obj interface{}) { - // Convert from unstructured object. - tfJob, err := tfJobFromUnstructured(obj) - if err != nil { - un, ok := obj.(*metav1unstructured.Unstructured) - logger := &log.Entry{} - if ok { - logger = commonutil.LoggerForUnstructured(un, tfv1.Kind) - } - logger.Errorf("Failed to convert the TFJob: %v", err) - // Log the failure to conditions. - if err == errFailedMarshal { - errMsg := fmt.Sprintf("Failed to marshal the object to TFJob; the spec is invalid: %v", err) - logger.Warn(errMsg) - // TODO(jlewi): v1 doesn't appear to define an error type. 
- tc.Recorder.Event(un, v1.EventTypeWarning, failedMarshalTFJobReason, errMsg) - - status := commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{ - { - Type: commonv1.JobFailed, - Status: v1.ConditionTrue, - LastUpdateTime: metav1.Now(), - LastTransitionTime: metav1.Now(), - Reason: failedMarshalTFJobReason, - Message: errMsg, - }, - }, - } - - statusMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&status) - - if err != nil { - logger.Errorf("Could not covert the TFJobStatus to unstructured; %v", err) - return - } - - client, err := k8sutil.NewCRDRestClient(&tfv1.GroupVersion) - - if err == nil { - if err1 := metav1unstructured.SetNestedField(un.Object, statusMap, "status"); err1 != nil { - logger.Errorf("Could not set nested field: %v", err1) - } - logger.Infof("Updating the job to: %+v", un.Object) - err = client.UpdateStatus(un, tfv1.Plural) - if err != nil { - logger.Errorf("Could not update the TFJob: %v", err) - } - } else { - logger.Errorf("Could not create a REST client to update the TFJob") - } - } - return - } - - // Set default for the new tfjob. - // TODO(Jeffwan): Consider to change to scheme https://github.com/kubeflow/tf-operator/issues/1317#issuecomment-890397705 - tfv1.SetDefaults_TFJob(tfJob) - scheme.Scheme.Default(tfJob) - - msg := fmt.Sprintf("TFJob %s is created.", tfJob.Name) - logger := commonutil.LoggerForJob(tfJob) - logger.Info(msg) - - // Add a created condition. - err = commonutil.UpdateJobConditions(&tfJob.Status, commonv1.JobCreated, tfJobCreatedReason, msg) - if err != nil { - logger.Errorf("Append tfJob condition error: %v", err) - return - } - - // Convert from tfjob object - err = unstructuredFromTFJob(obj, tfJob) - if err != nil { - logger.Errorf("Failed to convert the obj: %v", err) - return - } - tc.enqueueTFJob(obj) - tfJobsCreatedCount.WithLabelValues(tfJob.Namespace).Inc() -} - -// updateTFJob enqueues the current tfjob. 
-func (tc *TFController) updateTFJob(old, cur interface{}) { - oldTFJob, err := tfJobFromUnstructured(old) - if err != nil { - return - } - curTFJob, err := tfJobFromUnstructured(cur) - if err != nil { - return - } - - // never return error - key, err := KeyFunc(curTFJob) - if err != nil { - return - } - - log.Infof("Updating tfjob: %s", oldTFJob.Name) - tc.enqueueTFJob(cur) - - // check if need to add a new rsync for ActiveDeadlineSeconds - if curTFJob.Status.StartTime != nil { - curTFJobADS := curTFJob.Spec.RunPolicy.ActiveDeadlineSeconds - if curTFJobADS == nil { - return - } - oldTFJobADS := oldTFJob.Spec.RunPolicy.ActiveDeadlineSeconds - if oldTFJobADS == nil || *oldTFJobADS != *curTFJobADS { - now := metav1.Now() - start := curTFJob.Status.StartTime.Time - passed := now.Time.Sub(start) - total := time.Duration(*curTFJobADS) * time.Second - // AddAfter will handle total < passed - tc.WorkQueue.AddAfter(key, total-passed) - log.Infof("job ActiveDeadlineSeconds updated, will rsync after %d seconds", total-passed) - } - } -} diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go deleted file mode 100644 index f5b0df4e6d..0000000000 --- a/pkg/controller.v1/tensorflow/job_test.go +++ /dev/null @@ -1,810 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package tensorflow - -import ( - "testing" - "time" - - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - kubeclientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/record" - batchv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - common "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/control" - commonutil "github.com/kubeflow/common/pkg/util" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/testutil" -) - -func TestAddTFJob(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, - options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - var key string - syncChan := make(chan string) - ctr.syncHandler = func(tfJobKey string) (bool, error) { - key = tfJobKey - <-syncChan - return true, nil - } - - tfJob := testutil.NewTFJob(1, 0) - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - ctr.addTFJob(unstructured) - - syncChan <- "sync" - if key != testutil.GetKey(tfJob, t) { - t.Errorf("Failed to enqueue the TFJob %s: expected %s, got %s", tfJob.Name, testutil.GetKey(tfJob, t), key) - } - close(stopCh) -} - -func TestCopyLabelsAndAnnotation(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - tfJob := testutil.NewTFJob(1, 0) - annotations := map[string]string{ - "annotation1": "1", - } - labels := map[string]string{ - "label1": "1", - } - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].Template.Labels = labels - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].Template.Annotations = annotations - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - _ = ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) - - if len(fakePodControl.Templates) != 1 { - t.Errorf("Expected to create 1 pod while got %d", len(fakePodControl.Templates)) - } - actual := fakePodControl.Templates[0] - v, exist := actual.Labels["label1"] - if !exist { - t.Errorf("Labels does not exist") - } - if v != "1" { - t.Errorf("Labels value do not equal") - } - - v, exist = 
actual.Annotations["annotation1"] - if !exist { - t.Errorf("Annotations does not exist") - } - if v != "1" { - t.Errorf("Annotations value does not equal") - } - - close(stopCh) -} - -func TestDeletePodsAndServices(t *testing.T) { - type testCase struct { - description string - tfJob *tfv1.TFJob - - pendingWorkerPods int32 - activeWorkerPods int32 - succeededWorkerPods int32 - failedWorkerPods int32 - - pendingPSPods int32 - activePSPods int32 - succeededPSPods int32 - failedPSPods int32 - - activeWorkerServices int32 - activePSServices int32 - - expectedPodDeletions int - } - - testCases := []testCase{ - testCase{ - description: "4 workers and 2 ps is running, policy is all", - tfJob: testutil.NewTFJobWithCleanPolicy(0, 4, 2, common.CleanPodPolicyAll), - - pendingWorkerPods: 0, - activeWorkerPods: 4, - succeededWorkerPods: 0, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 2, - succeededPSPods: 0, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 6, - }, - testCase{ - description: "4 workers and 2 ps is running, policy is running", - tfJob: testutil.NewTFJobWithCleanPolicy(0, 4, 2, common.CleanPodPolicyRunning), - - pendingWorkerPods: 0, - activeWorkerPods: 4, - succeededWorkerPods: 0, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 2, - succeededPSPods: 0, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 6, - }, - testCase{ - description: "4 workers and 2 ps is succeeded, policy is running", - tfJob: testutil.NewTFJobWithCleanPolicy(0, 4, 2, common.CleanPodPolicyRunning), - - pendingWorkerPods: 0, - activeWorkerPods: 0, - succeededWorkerPods: 4, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 0, - succeededPSPods: 2, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 0, - }, - testCase{ - description: "4 workers and 2 ps is succeeded, policy is None", - tfJob: 
testutil.NewTFJobWithCleanPolicy(0, 4, 2, common.CleanPodPolicyNone), - - pendingWorkerPods: 0, - activeWorkerPods: 0, - succeededWorkerPods: 4, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 0, - succeededPSPods: 2, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 0, - }, - } - for _, tc := range testCases { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - fakeServiceControl := &control.FakeServiceControl{} - ctr.ServiceControl = fakeServiceControl - ctr.Recorder = &record.FakeRecorder{} - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - // Set succeeded to run the logic about deleting. 
- err := commonutil.UpdateJobConditions(&tc.tfJob.Status, common.JobSucceeded, tfJobSucceededReason, "") - if err != nil { - t.Errorf("Append tfjob condition error: %v", err) - } - - unstructured, err := testutil.ConvertTFJobToUnstructured(tc.tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, t) - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, t) - - serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() - testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelWorker, tc.activeWorkerServices, t) - testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelPS, tc.activePSServices, t) - - _ = ctr.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy) - // forget, err := ctr.syncTFJob(testutil.GetKey(tc.tfJob, t)) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) - // } - // if !forget { - // t.Errorf("%s: unexpected forget value. Expected true, saw %v\n", tc.description, forget) - // } - - if len(fakePodControl.DeletePodName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) - } - if len(fakeServiceControl.DeleteServiceName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of service deletes. 
Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakeServiceControl.DeleteServiceName)) - } - } -} - -// TODO(ChanYiLin): I have to remove this test since I can't overwrite the deleteTFJobHandler() function -// It is now in common library as part of controller interface - DeleteJob() -// func TestCleanupTFJob(t *testing.T) { -// type testCase struct { -// description string -// tfJob *tfv1.TFJob - -// pendingWorkerPods int32 -// activeWorkerPods int32 -// succeededWorkerPods int32 -// failedWorkerPods int32 - -// pendingPSPods int32 -// activePSPods int32 -// succeededPSPods int32 -// failedPSPods int32 - -// activeWorkerServices int32 -// activePSServices int32 - -// expectedDeleteFinished bool -// } - -// ttlaf0 := int32(0) -// ttl0 := &ttlaf0 -// ttlaf2s := int32(2) -// ttl2s := &ttlaf2s -// testCases := []testCase{ -// testCase{ -// description: "4 workers and 2 ps is running, TTLSecondsAfterFinished unset", -// tfJob: testutil.NewTFJobWithCleanupJobDelay(0, 4, 2, nil), - -// pendingWorkerPods: 0, -// activeWorkerPods: 4, -// succeededWorkerPods: 0, -// failedWorkerPods: 0, - -// pendingPSPods: 0, -// activePSPods: 2, -// succeededPSPods: 0, -// failedPSPods: 0, - -// activeWorkerServices: 4, -// activePSServices: 2, - -// expectedDeleteFinished: false, -// }, -// testCase{ -// description: "4 workers and 2 ps is running, TTLSecondsAfterFinished is 0", -// tfJob: testutil.NewTFJobWithCleanupJobDelay(0, 4, 2, ttl0), - -// pendingWorkerPods: 0, -// activeWorkerPods: 4, -// succeededWorkerPods: 0, -// failedWorkerPods: 0, - -// pendingPSPods: 0, -// activePSPods: 2, -// succeededPSPods: 0, -// failedPSPods: 0, - -// activeWorkerServices: 4, -// activePSServices: 2, - -// expectedDeleteFinished: true, -// }, -// testCase{ -// description: "4 workers and 2 ps is succeeded, TTLSecondsAfterFinished is 2", -// tfJob: testutil.NewTFJobWithCleanupJobDelay(0, 4, 2, ttl2s), - -// pendingWorkerPods: 0, -// activeWorkerPods: 0, -// succeededWorkerPods: 4, 
-// failedWorkerPods: 0, - -// pendingPSPods: 0, -// activePSPods: 0, -// succeededPSPods: 2, -// failedPSPods: 0, - -// activeWorkerServices: 4, -// activePSServices: 2, - -// expectedDeleteFinished: true, -// }, -// } -// for _, tc := range testCases { -// // Prepare the clientset and controller for the test. -// kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ -// Host: "", -// ContentConfig: rest.ContentConfig{ -// GroupVersion: &v1.SchemeGroupVersion, -// }, -// }, -// ) - -// // Prepare the volcano clientset and controller for the test. -// volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ -// Host: "", -// ContentConfig: rest.ContentConfig{ -// GroupVersion: &batchv1beta1.SchemeGroupVersion, -// }, -// }, -// ) - -// config := &rest.Config{ -// Host: "", -// ContentConfig: rest.ContentConfig{ -// GroupVersion: &tfv1.SchemeGroupVersion, -// }, -// } -// tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) -// ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, volcanoClientSet, tfJobClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) -// fakePodControl := &controller.FakePodControl{} -// ctr.PodControl = fakePodControl -// fakeServiceControl := &control.FakeServiceControl{} -// ctr.ServiceControl = fakeServiceControl -// ctr.Recorder = &record.FakeRecorder{} -// ctr.tfJobInformerSynced = testutil.AlwaysReady -// ctr.PodInformerSynced = testutil.AlwaysReady -// ctr.ServiceInformerSynced = testutil.AlwaysReady -// tfJobIndexer := ctr.tfJobInformer.GetIndexer() -// ctr.updateStatusHandler = func(job interface{}, jobStatus *commonv1.JobStatus) error { -// return nil -// } -// deleteFinished := false -// ctr.deleteTFJobHandler = func(tfJob *tfv1.TFJob) error { -// deleteFinished = true -// return nil -// } - -// // Set succeeded to run the logic about deleting. 
-// testutil.SetTFJobCompletionTime(tc.tfJob) -// err := commonutil.UpdateJobConditions(&tc.tfJob.Status, common.JobSucceeded, tfJobSucceededReason, "") -// if err != nil { -// t.Errorf("Append tfjob condition error: %v", err) -// } - -// unstructured, err := testutil.ConvertTFJobToUnstructured(tc.tfJob) -// if err != nil { -// t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) -// } - -// if err := tfJobIndexer.Add(unstructured); err != nil { -// t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) -// } - -// podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() -// testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, t) -// testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, t) - -// serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() -// testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelWorker, tc.activeWorkerServices, t) -// testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelPS, tc.activePSServices, t) - -// ttl := tc.tfJob.Spec.RunPolicy.TTLSecondsAfterFinished -// if ttl != nil { -// dur := time.Second * time.Duration(*ttl) -// time.Sleep(dur) -// } - -// //forget, err := ctr.syncTFJob(testutil.GetKey(tc.tfJob, t)) -// _ = ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) -// ctr.DeleteJob = func(job interface{}) error { -// deleteFinished = true -// return nil -// } -// // if err != nil { -// // t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) -// // } -// // if !forget { -// // t.Errorf("%s: unexpected forget value. Expected true, saw %v\n", tc.description, forget) -// // } - -// if deleteFinished != tc.expectedDeleteFinished { -// t.Errorf("%s: unexpected status. 
Expected %v, saw %v", tc.description, tc.expectedDeleteFinished, deleteFinished) -// } -// } -// } - -func TestActiveDeadlineSeconds(t *testing.T) { - type testCase struct { - description string - tfJob *tfv1.TFJob - - pendingWorkerPods int32 - activeWorkerPods int32 - succeededWorkerPods int32 - failedWorkerPods int32 - - pendingPSPods int32 - activePSPods int32 - succeededPSPods int32 - failedPSPods int32 - - activeWorkerServices int32 - activePSServices int32 - - expectedPodDeletions int - } - - ads2 := int64(2) - adsTest2 := &ads2 - testCases := []testCase{ - testCase{ - description: "4 workers and 2 ps is running, ActiveDeadlineSeconds unset", - tfJob: testutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, nil), - - pendingWorkerPods: 0, - activeWorkerPods: 4, - succeededWorkerPods: 0, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 2, - succeededPSPods: 0, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 0, - }, - testCase{ - description: "4 workers and 2 ps is running, ActiveDeadlineSeconds is 2", - tfJob: testutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, adsTest2), - - pendingWorkerPods: 0, - activeWorkerPods: 4, - succeededWorkerPods: 0, - failedWorkerPods: 0, - - pendingPSPods: 0, - activePSPods: 2, - succeededPSPods: 0, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 6, - }, - } - for _, tc := range testCases { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - fakeServiceControl := &control.FakeServiceControl{} - ctr.ServiceControl = fakeServiceControl - ctr.Recorder = &record.FakeRecorder{} - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - unstructured, err := testutil.ConvertTFJobToUnstructured(tc.tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, t) - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, t) - - serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() - testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelWorker, tc.activeWorkerServices, t) - testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelPS, tc.activePSServices, t) - - foo, _ := ctr.getTFJobFromName("default", "test-tfjob") - now := metav1.Now() - foo.Status.StartTime = &now - - ads := 
tc.tfJob.Spec.RunPolicy.ActiveDeadlineSeconds - if ads != nil { - dur := time.Second * time.Duration(*ads) - time.Sleep(dur) - } - - _ = ctr.ReconcileJobs(foo, foo.Spec.TFReplicaSpecs, foo.Status, &foo.Spec.RunPolicy) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) - // } - - if len(fakePodControl.DeletePodName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) - } - if len(fakeServiceControl.DeleteServiceName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of service deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakeServiceControl.DeleteServiceName)) - } - } -} - -func TestBackoffForOnFailure(t *testing.T) { - type testCase struct { - description string - tfJob *tfv1.TFJob - - pendingWorkerPods int32 - activeWorkerPods int32 - succeededWorkerPods int32 - failedWorkerPods int32 - - restartCounts []int32 - - pendingPSPods int32 - activePSPods int32 - succeededPSPods int32 - failedPSPods int32 - - activeWorkerServices int32 - activePSServices int32 - - expectedPodDeletions int - } - - backoffLimit4 := int32(4) - backoffLimitTest4 := &backoffLimit4 - testCases := []testCase{ - testCase{ - description: "4 workers each having 1 restartCount and 2 ps is running, backoffLimit 4 ", - tfJob: testutil.NewTFJobWithBackoffLimit(0, 4, 2, backoffLimitTest4), - - pendingWorkerPods: 0, - activeWorkerPods: 4, - succeededWorkerPods: 0, - failedWorkerPods: 0, - - restartCounts: []int32{1, 1, 1, 1}, - - pendingPSPods: 0, - activePSPods: 2, - succeededPSPods: 0, - failedPSPods: 0, - - activeWorkerServices: 4, - activePSServices: 2, - - expectedPodDeletions: 6, - }, - } - for _, tc := range testCases { - // Prepare the clientset and controller for the test. 
- kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - fakeServiceControl := &control.FakeServiceControl{} - ctr.ServiceControl = fakeServiceControl - ctr.Recorder = &record.FakeRecorder{} - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - unstructured, err := testutil.ConvertTFJobToUnstructured(tc.tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, tc.restartCounts, t) - testutil.SetPodsStatuses(podIndexer, tc.tfJob, testutil.LabelPS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, tc.restartCounts, t) - - serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() - testutil.SetServices(serviceIndexer, tc.tfJob, 
testutil.LabelWorker, tc.activeWorkerServices, t) - testutil.SetServices(serviceIndexer, tc.tfJob, testutil.LabelPS, tc.activePSServices, t) - - _ = ctr.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy) - // forget, err := ctr.syncTFJob(testutil.GetKey(tc.tfJob, t)) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) - // } - // if !forget { - // t.Errorf("%s: unexpected forget value. Expected true, saw %v\n", tc.description, forget) - // } - - if len(fakePodControl.DeletePodName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) - } - if len(fakeServiceControl.DeleteServiceName) != tc.expectedPodDeletions { - t.Errorf("%s: unexpected number of service deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakeServiceControl.DeleteServiceName)) - } - } -} diff --git a/pkg/controller.v1/tensorflow/pod.go b/pkg/controller.v1/tensorflow/pod.go deleted file mode 100644 index 37b9bdb729..0000000000 --- a/pkg/controller.v1/tensorflow/pod.go +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TFJob resource. 
-package tensorflow - -import ( - "fmt" - "strconv" - "strings" - - "github.com/kubeflow/tf-operator/pkg/common/util" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/common" - "github.com/kubeflow/common/pkg/controller.v1/expectation" - commonutil "github.com/kubeflow/common/pkg/util" - train_util "github.com/kubeflow/common/pkg/util/train" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" -) - -const ( - // gang scheduler name. - gangSchedulerName = "volcano" - // tfConfig is the environment variable name of TensorFlow cluster spec. - tfConfig = "TF_CONFIG" - // exitedWithCodeReason is the normal reason when the pod is exited because of the exit code. - exitedWithCodeReason = "ExitedWithCode" - // podTemplateRestartPolicyReason is the warning reason when the restart - // policy is set in pod template. - podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy" - // podTemplateSchedulerNameReason is the warning reason when other scheduler name is set - // in pod templates with gang-scheduling enabled - podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName" - // gangSchedulingPodGroupAnnotation is the annotation key used by batch schedulers - gangSchedulingPodGroupAnnotation = "scheduling.k8s.io/group-name" -) - -var ( - tfJobsRestartCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "tf_operator_jobs_restarted_total", - Help: "Counts number of TF jobs restarted", - }, - []string{"job_namespace"}, - ) -) - -// reconcilePods checks and updates pods for each given TFReplicaSpec. -// It will requeue the tfjob in case of an error while creating/deleting pods. 
-func (tc *TFController) ReconcilePods( - job interface{}, - jobStatus *commonv1.JobStatus, - pods []*v1.Pod, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, -) error { - - tfJob, ok := job.(*tfv1.TFJob) - if !ok { - return fmt.Errorf("%v is not a type of TFJob", tfJob) - } - - // Convert ReplicaType to lower string. - rt := strings.ToLower(string(rtype)) - logger := commonutil.LoggerForJob(tfJob) - // Get all pods for the type rt. - pods, err := tc.FilterPodsForReplicaType(pods, rt) - if err != nil { - return err - } - numReplicas := int(*spec.Replicas) - masterRole := false - //restart := false - //worker0Completed := false - - initializeReplicaStatuses(jobStatus, rtype) - - // GetPodSlices will return enough information here to make decision to add/remove/update resources. - // - // For example, let's assume we have pods with replica-index 0, 1, 2 - // If replica is 4, return a slice with size 4. [[0],[1],[2],[]], a pod with replica-index 3 will be created. - // - // If replica is 1, return a slice with size 3. [[0],[1],[2]], pod with replica-index 1 and 2 are out of range and will be deleted. - podSlices := tc.GetPodSlices(pods, numReplicas, logger) - for index, podSlice := range podSlices { - if len(podSlice) > 1 { - logger.Warningf("We have too many pods for %s %d", rt, index) - } else if len(podSlice) == 0 { - logger.Infof("Need to create new pod: %s-%d", rt, index) - - // check if this replica is the master role - masterRole = tc.IsMasterRole(replicas, rtype, index) - // TODO: [should change to CreateNewPod] - err = tc.createNewPod(tfJob, rt, strconv.Itoa(index), spec, masterRole, replicas) - if err != nil { - return err - } - } else { - // Check the status of the current pod. 
- pod := podSlice[0] - - // check if the index is in the valid range, if not, we should kill the pod - if index < 0 || index >= numReplicas { - err = tc.PodControl.DeletePod(pod.Namespace, pod.Name, tfJob) - if err != nil { - return err - } - } - // Get the exit code of the container. - var exitCode int32 = 0xbeef // magic number - for _, status := range pod.Status.ContainerStatuses { - state := status.State - if status.Name == tc.GetDefaultContainerName() && state.Terminated != nil { - exitCode = state.Terminated.ExitCode - logger.Infof("Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode) - tc.Recorder.Eventf(tfJob, v1.EventTypeNormal, exitedWithCodeReason, "Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode) - } - } - // Check if the pod is retryable. - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { - if pod.Status.Phase == v1.PodFailed && train_util.IsRetryableExitCode(exitCode) { - logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name) - if err := tc.PodControl.DeletePod(pod.Namespace, pod.Name, tfJob); err != nil { - return err - } - - // with common library framework, we have to handle restart status here - // or we won't know which replica has been restarted in updateJobStatus after reconciling all replicas - msg := fmt.Sprintf("TFJob %s is restarting because %s replica(s) failed.", - tfJob.Name, rtype) - tc.Recorder.Event(tfJob, corev1.EventTypeWarning, tfJobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, tfJobRestartingReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) - return err - } - tfJobsRestartCount.WithLabelValues(tfJob.Namespace).Inc() - } - } - - updateJobReplicaStatuses(jobStatus, rtype, pod) - } - } - return nil -} - -// createNewPod creates a new pod for the given index and type. 
-func (tc *TFController) createNewPod(tfjob *tfv1.TFJob, rt, index string, spec *commonv1.ReplicaSpec, masterRole bool, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { - - tfjobKey, err := KeyFunc(tfjob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfjob, err)) - return err - } - expectationPodsKey := expectation.GenExpectationPodsKey(tfjobKey, rt) - err = tc.Expectations.ExpectCreations(expectationPodsKey, 1) - if err != nil { - return err - } - logger := commonutil.LoggerForReplica(tfjob, rt) - // Create OwnerReference. - controllerRef := tc.GenOwnerReference(tfjob) - - // Set type and index for the worker. - labels := tc.GenLabels(tfjob.Name) - labels[tfReplicaTypeLabel] = rt - labels[tfReplicaIndexLabel] = index - - if masterRole { - labels[commonv1.JobRoleLabel] = "master" - } - - podTemplate := spec.Template.DeepCopy() - - // Set name for the template. - podTemplate.Name = common.GenGeneralName(tfjob.Name, rt, index) - - if podTemplate.Labels == nil { - podTemplate.Labels = make(map[string]string) - } - - for key, value := range labels { - podTemplate.Labels[key] = value - } - - if err := tc.SetClusterSpec(tfjob, podTemplate, rt, index); err != nil { - return err - } - - // Submit a warning event if the user specifies restart policy for - // the pod template. We recommend to set it from the replica level. - if podTemplate.Spec.RestartPolicy != v1.RestartPolicy("") { - errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec" - logger.Warning(errMsg) - tc.Recorder.Event(tfjob, v1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg) - } - setRestartPolicy(podTemplate, spec) - - // if gang-scheduling is enabled: - // 1. if user has specified other scheduler, we report a warning without overriding any fields. - // 2. if no SchedulerName is set for pods, then we set the SchedulerName to "kube-batch". 
- if tc.Config.EnableGangScheduling { - if util.IsGangSchedulerSet(replicas, gangSchedulerName) { - errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten" - logger.Warning(errMsg) - tc.Recorder.Event(tfjob, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg) - } else { - podTemplate.Spec.SchedulerName = gangSchedulerName - } - - if podTemplate.Annotations == nil { - podTemplate.Annotations = map[string]string{} - } - podTemplate.Annotations[gangSchedulingPodGroupAnnotation] = tfjob.GetName() - podTemplate.Annotations[volcanoTaskSpecKey] = rt - } - - err = tc.PodControl.CreatePodsWithControllerRef(tfjob.Namespace, podTemplate, tfjob, controllerRef) - if err != nil && errors.IsTimeout(err) { - // Pod is created but its initialization has timed out. - // If the initialization is successful eventually, the - // controller will observe the creation via the informer. - // If the initialization fails, or if the pod keeps - // uninitialized for a long time, the informer will not - // receive any update, and the controller will create a new - // pod when the expectation expires. - return nil - } else if err != nil { - // Decrement the expected number of creates because the informer won't observe this pod - logger.Infof( - "Failed creation, decrementing expectations for tfjob %s/%s, key %s", - tfjob.Namespace, tfjob.Name, expectationPodsKey) - tc.Expectations.CreationObserved(expectationPodsKey) - return err - } - return nil -} - -// SetClusterSpec generates and sets TF_CONFIG for the given podTemplateSpec. -func (tc *TFController) SetClusterSpec(job interface{}, podTemplate *v1.PodTemplateSpec, rtype, index string) error { - tfjob, ok := job.(*tfv1.TFJob) - if !ok { - return fmt.Errorf("%v is not a type of MXJob", tfjob) - } - - // Do not set TF_CONFIG for local training jobs. - if !isDistributed(tfjob) { - return nil - } - // Generate TF_CONFIG JSON string. 
- tfConfigStr, err := genTFConfigJSONStr(tfjob, rtype, index) - if err != nil { - return err - } - - if tfConfigStr == "" { - return nil - } - // Add TF_CONFIG environment variable to tensorflow container in the pod. - for i := range podTemplate.Spec.Containers { - if podTemplate.Spec.Containers[i].Name == tfv1.DefaultContainerName { - if len(podTemplate.Spec.Containers[i].Env) == 0 { - podTemplate.Spec.Containers[i].Env = make([]v1.EnvVar, 0) - } - podTemplate.Spec.Containers[i].Env = append(podTemplate.Spec.Containers[i].Env, v1.EnvVar{ - Name: tfConfig, - Value: tfConfigStr, - }) - break - } - } - return nil -} - -// isDistributed returns if the TFJob is a distributed training job. -// Ref https://github.com/kubeflow/tf-operator/issues/1078. -func isDistributed(tfjob *tfv1.TFJob) bool { - replicas := tfjob.Spec.TFReplicaSpecs - distributionCount := 0 - allTypes := []commonv1.ReplicaType{ - tfv1.TFReplicaTypeChief, - tfv1.TFReplicaTypeEval, - tfv1.TFReplicaTypeMaster, - tfv1.TFReplicaTypePS, - tfv1.TFReplicaTypeWorker, - } - // Check if there is only one replica. 
- for _, typ := range allTypes { - if replicas[typ] != nil { - if replicas[typ].Replicas == nil { - distributionCount++ - } else { - distributionCount += int(*replicas[typ].Replicas) - } - } - } - return distributionCount != 1 -} - -func setRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *commonv1.ReplicaSpec) { - // This is necessary since restartPolicyExitCode is not supported in v1.PodTemplateSpec - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { - podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicyNever - } else { - podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy) - } -} - -func (tc *TFController) getPodSlices(tfjob *tfv1.TFJob, replicasNum *int32) ([][]*v1.Pod, error) { - logger := commonutil.LoggerForReplica(tfjob, strings.ToLower(string(tfv1.TFReplicaTypeWorker))) - - pods, err := tc.Controller.GetPodsForJob(tfjob) - if err != nil { - commonutil.LoggerForJob(tfjob).Warnf("getPodsForTFJob error %v", err) - return nil, err - } - - // Get all pods for the type rt. 
- pods, err = tc.FilterPodsForReplicaType(pods, strings.ToLower(string(tfv1.TFReplicaTypeWorker))) - if err != nil { - return nil, err - } - - podSlices := tc.GetPodSlices(pods, int(*replicasNum), logger) - return podSlices, nil -} - -func getContainerExitCode(pod *v1.Pod) int32 { - var exitCode int32 = 0xbeef // magic number - for _, status := range pod.Status.ContainerStatuses { - state := status.State - if status.Name == tfv1.DefaultContainerName && state.Terminated != nil { - exitCode = state.Terminated.ExitCode - } - } - return exitCode -} - -// IsWorker0Completed return true if pod of worker0 succeeded and exited with 0 -func (tc *TFController) IsWorker0Completed(tfjob *tfv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error) { - worker0Completed := false - _, ok := replicas[tfv1.TFReplicaTypeWorker] - if !ok { - return true, nil - } - podSlices, err := tc.getPodSlices(tfjob, replicas[tfv1.TFReplicaTypeWorker].Replicas) - if err != nil { - return false, err - } - for index, podSlice := range podSlices { - if len(podSlice) == 1 { - pod := podSlice[0] - exitCode := getContainerExitCode(pod) - if index == 0 && exitCode == 0 && pod.Status.Phase == v1.PodSucceeded { - worker0Completed = true - } - } - } - return worker0Completed, nil -} diff --git a/pkg/controller.v1/tensorflow/pod_test.go b/pkg/controller.v1/tensorflow/pod_test.go deleted file mode 100644 index 6c3c67d8d7..0000000000 --- a/pkg/controller.v1/tensorflow/pod_test.go +++ /dev/null @@ -1,823 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TFJob resource. -package tensorflow - -import ( - "fmt" - "os" - "reflect" - "testing" - - v1 "k8s.io/api/core/v1" - kubeclientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/record" - batchv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/common" - "github.com/kubeflow/common/pkg/controller.v1/control" - "github.com/kubeflow/common/pkg/controller.v1/expectation" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/testutil" -) - -func TestAddPod(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - var key string - syncChan := make(chan string) - ctr.syncHandler = func(tfJobKey string) (bool, error) { - key = tfJobKey - <-syncChan - return true, nil - } - - tfJob := testutil.NewTFJob(1, 0) - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - pod := testutil.NewPod(tfJob, testutil.LabelWorker, 0) - ctr.AddPod(pod) - - syncChan <- "sync" - if key != testutil.GetKey(tfJob, t) { - t.Errorf("Failed to enqueue the TFJob %s: expected %s, got %s", tfJob.Name, testutil.GetKey(tfJob, t), key) - } - close(stopCh) -} - -func TestExpectation(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - - ctr.PodControl = &control.FakePodControl{} - tfJob := testutil.NewTFJob(2, 1) - - var err error - if err = ctr.createNewPod(tfJob, "worker", "0", - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker], - false, tfJob.Spec.TFReplicaSpecs); err != nil { - t.Errorf("Expected get nil, got error %v", err) - } - - tfjobKey, err := KeyFunc(tfJob) - if err != nil { - t.Errorf("Expected nil, got error %v", err) - } - expectationPodsKey := expectation.GenExpectationPodsKey(tfjobKey, "worker") - e, found, err := ctr.Expectations.GetExpectations(expectationPodsKey) - if err != nil { - t.Errorf("Expected nil, got error %v", err) - } - if !found { - t.Errorf("Expected to get the corresponding expectation") - } - if add, del := e.GetExpectations(); add != 1 || del != 0 { - t.Errorf("Expected get 1 add and 0 del, got %d add and %d del", add, del) - } -} - -func TestExpectationWithError(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - - ctr.PodControl = &control.FakePodControl{} - tfJob := testutil.NewTFJob(2, 1) - - // Fake an error. - ctr.PodControl.(*control.FakePodControl).Err = fmt.Errorf("Fake") - - var err error - if err = ctr.createNewPod(tfJob, "worker", "0", - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker], - false, tfJob.Spec.TFReplicaSpecs); err == nil { - t.Errorf("Expected error, got nil") - } - - tfjobKey, err := KeyFunc(tfJob) - if err != nil { - t.Errorf("Expected nil, got error %v", err) - } - expectationPodsKey := expectation.GenExpectationPodsKey(tfjobKey, "worker") - e, found, err := ctr.Expectations.GetExpectations(expectationPodsKey) - if err != nil { - t.Errorf("Expected nil, got error %v", err) - } - if !found { - t.Errorf("Expected to get the corresponding expectation") - } - if add, del := e.GetExpectations(); add != 0 || del != 0 { - t.Errorf("Expected get 0 add and 0 del, got %d add and %d del", add, del) - } -} - -func TestClusterSpec(t *testing.T) { - type tc struct { - tfJob *tfv1.TFJob - rt string - index string - customClusterDomain string - expectedClusterSpec string - } - testCase := []tc{ - tc{ - tfJob: testutil.NewTFJobWithNamespace(1, 0, "ns0"), - rt: "worker", - index: "0", - customClusterDomain: "", - expectedClusterSpec: "", - }, - tc{ - tfJob: testutil.NewTFJobWithNamespace(1, 0, "ns1"), - rt: "worker", - index: "0", - 
customClusterDomain: "tf.training.com", - expectedClusterSpec: "", - }, - tc{ - tfJob: testutil.NewTFJobWithNamespace(1, 1, "ns2"), - rt: "worker", - index: "0", - customClusterDomain: "tf.training.org", - expectedClusterSpec: `{"cluster":{"ps":["` + testutil.TestTFJobName + - `-ps-0.ns2.svc.tf.training.org:2222"],"worker":["` + testutil.TestTFJobName + - `-worker-0.ns2.svc.tf.training.org:2222"]},"task":{"type":"worker","index":0},"environment":"cloud"}`, - }, - tc{ - tfJob: testutil.NewTFJobWithEvaluatorAndNamespace(1, 1, 1, "ns3"), - rt: "worker", - index: "0", - customClusterDomain: "tf.training.io", - expectedClusterSpec: `{"cluster":{"evaluator":["` + testutil.TestTFJobName + - `-evaluator-0.ns3.svc.tf.training.io:2222"],"ps":["` + testutil.TestTFJobName + - `-ps-0.ns3.svc.tf.training.io:2222"],"worker":["` + testutil.TestTFJobName + - `-worker-0.ns3.svc.tf.training.io:2222"]},"task":{"type":"worker","index":0},"environment":"cloud"}`, - }, - tc{ - tfJob: testutil.NewTFJobWithEvaluatorAndNamespace(1, 1, 1, "ns3"), - rt: "worker", - index: "0", - customClusterDomain: "", - expectedClusterSpec: `{"cluster":{"evaluator":["` + testutil.TestTFJobName + - `-evaluator-0.ns3.svc:2222"],"ps":["` + testutil.TestTFJobName + - `-ps-0.ns3.svc:2222"],"worker":["` + testutil.TestTFJobName + - `-worker-0.ns3.svc:2222"]},"task":{"type":"worker","index":0},"environment":"cloud"}`, - }, - } - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - - for _, c := range testCase { - os.Setenv(EnvCustomClusterDomain, c.customClusterDomain) - - podTemplate := c.tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].Template.DeepCopy() - - // Set name for the template. - podTemplate.Name = common.GenGeneralName(c.tfJob.GetName(), c.rt, c.index) - - if podTemplate.Labels == nil { - podTemplate.Labels = make(map[string]string) - } - - // Set type and index for the worker. - labels := ctr.GenLabels(c.tfJob.GetName()) - labels[commonv1.ReplicaTypeLabel] = c.rt - labels[commonv1.ReplicaIndexLabel] = c.index - - for key, value := range labels { - podTemplate.Labels[key] = value - } - - if err := ctr.SetClusterSpec(c.tfJob, podTemplate, c.rt, c.index); err != nil { - t.Errorf("Failed to set cluster spec: %v", err) - } - // The expected cluster spec is nil, which means that we should not set TF_CONFIG. 
- if c.expectedClusterSpec == "" { - if len(podTemplate.Spec.Containers[0].Env) != 0 { - t.Errorf("Expected empty TF_CONFIG, got %s", - podTemplate.Spec.Containers[0].Env[0].Value) - } - } else { - actual := podTemplate.Spec.Containers[0].Env[0].Value - if c.expectedClusterSpec != actual { - t.Errorf("Expected %s, got %s", c.expectedClusterSpec, actual) - } - } - } -} - -func TestIsDistributed(t *testing.T) { - type tc struct { - tfJob *tfv1.TFJob - expected bool - } - testCase := []tc{ - { - tfJob: testutil.NewTFJob(1, 0), - expected: false, - }, - { - tfJob: testutil.NewTFJob(1, 1), - expected: true, - }, - { - tfJob: testutil.NewTFJob(0, 1), - expected: false, - }, - { - tfJob: testutil.NewTFJobWithChief(1, 0), - expected: true, - }, - } - for _, c := range testCase { - actual := isDistributed(c.tfJob) - if actual != c.expected { - t.Errorf("Expected %t, got %t", c.expected, actual) - } - } -} - -func TestRestartPolicy(t *testing.T) { - type tc struct { - tfJob *tfv1.TFJob - expectedRestartPolicy v1.RestartPolicy - expectedType commonv1.ReplicaType - } - testCase := []tc{ - func() tc { - tfJob := testutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyExitCode - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy - return tc{ - tfJob: tfJob, - expectedRestartPolicy: v1.RestartPolicyNever, - expectedType: tfv1.TFReplicaTypeWorker, - } - }(), - func() tc { - tfJob := testutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyNever - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy - return tc{ - tfJob: tfJob, - expectedRestartPolicy: v1.RestartPolicyNever, - expectedType: tfv1.TFReplicaTypeWorker, - } - }(), - func() tc { - tfJob := testutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyAlways - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy - return tc{ - tfJob: tfJob, - expectedRestartPolicy: v1.RestartPolicyAlways, - 
expectedType: tfv1.TFReplicaTypeWorker, - } - }(), - func() tc { - tfJob := testutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyOnFailure - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy - return tc{ - tfJob: tfJob, - expectedRestartPolicy: v1.RestartPolicyOnFailure, - expectedType: tfv1.TFReplicaTypeWorker, - } - }(), - } - for _, c := range testCase { - spec := c.tfJob.Spec.TFReplicaSpecs[c.expectedType] - podTemplate := spec.Template - setRestartPolicy(&podTemplate, spec) - if podTemplate.Spec.RestartPolicy != c.expectedRestartPolicy { - t.Errorf("Expected %s, got %s", c.expectedRestartPolicy, podTemplate.Spec.RestartPolicy) - } - } -} - -func TestExitCode(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - tfJob := testutil.NewTFJob(1, 0) - tfJob.Spec.TFReplicaSpecs[tfv1.TFReplicaTypeWorker].RestartPolicy = commonv1.RestartPolicyExitCode - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - pod := testutil.NewPod(tfJob, testutil.LabelWorker, 0) - pod.Status.Phase = v1.PodFailed - pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{}) - pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, v1.ContainerStatus{ - Name: tfv1.DefaultContainerName, - State: v1.ContainerState{ - Terminated: &v1.ContainerStateTerminated{ - ExitCode: 130, - }, - }, - }) - - if err := podIndexer.Add(pod); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - _ = 
ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) - // _, err = ctr.syncTFJob(testutil.GetKey(tfJob, t)) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tfJob.Name, err) - // } - - found := false - for _, deletedPodName := range fakePodControl.DeletePodName { - if deletedPodName == pod.Name { - found = true - } - } - if !found { - t.Errorf("Failed to delete pod %s", pod.Name) - } - close(stopCh) -} - -// Test scaling down number of workers while training is running -func TestScaleDown(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - ctr.Recorder = &record.FakeRecorder{} - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - tfJob := 
testutil.NewTFJob(2, 0) - tfJob.SelfLink = "/api/v1/namespaces/default/tfjob/test-tfjob" - tfJob.Spec.EnableDynamicWorker = true - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - pod0 := testutil.NewPod(tfJob, testutil.LabelWorker, 0) - pod1 := testutil.NewPod(tfJob, testutil.LabelWorker, 1) - pod2 := testutil.NewPod(tfJob, testutil.LabelWorker, 2) - - if err := podIndexer.Add(pod0); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - if err := podIndexer.Add(pod1); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - if err := podIndexer.Add(pod2); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - - _ = ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) - // _, err = ctr.syncTFJob(testutil.GetKey(tfJob, t)) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tfJob.Name, err) - // } - - expectedDeletePods := []string{"worker-2"} - if !reflect.DeepEqual(expectedDeletePods, fakePodControl.DeletePodName) { - t.Errorf("Scale down workers test failed") - } - close(stopCh) -} - -// Test scaling up number of workers while training is running -func TestScaleUp(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - tfJob := testutil.NewTFJob(3, 0) - tfJob.Spec.EnableDynamicWorker = true - unstructured, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - pod0 := testutil.NewPod(tfJob, testutil.LabelWorker, 0) - - if err := podIndexer.Add(pod0); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - - _ = ctr.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) - // _, err = ctr.syncTFJob(testutil.GetKey(tfJob, t)) - // if err != nil { - // t.Errorf("%s: unexpected error when syncing jobs %v", tfJob.Name, err) - // } - - if !(len(fakePodControl.Templates) == 2 && fakePodControl.Templates[0].Name == "test-tfjob-worker-1" && fakePodControl.Templates[1].Name == 
"test-tfjob-worker-2") { - t.Error("Scale up workers test failed") - } - - close(stopCh) -} - -func TestIsWorker0Completed(t *testing.T) { - newInt32 := func(in int32) *int32 { - return &in - } - tests := []struct { - // worker failed, succeeded, running num - workers [3]int32 - tfJob *tfv1.TFJob - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec - expected bool - expectedErr bool - }{ - { - workers: [3]int32{0, 0, 1}, - tfJob: testutil.NewTFJobV2(1, 1, 0, 0, 0), - expected: false, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeWorker: { - Replicas: newInt32(1), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - tfv1.TFReplicaTypePS: { - Replicas: newInt32(1), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - { - workers: [3]int32{0, 1, 0}, - tfJob: testutil.NewTFJobV2(1, 0, 0, 0, 0), - expected: true, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeWorker: { - Replicas: newInt32(1), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - { - workers: [3]int32{0, 0, 0}, - tfJob: testutil.NewTFJobV2(0, 0, 1, 0, 0), - expected: true, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeMaster: { - Replicas: newInt32(1), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - { - workers: [3]int32{0, 0, 0}, - tfJob: testutil.NewTFJobV2(0, 0, 0, 1, 0), - expected: true, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeChief: { - Replicas: newInt32(1), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - { - workers: [3]int32{1, 1, 0}, - tfJob: testutil.NewTFJobV2(2, 0, 0, 0, 0), - expected: true, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeWorker: { - Replicas: newInt32(2), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - { - 
workers: [3]int32{1, 0, 1}, - tfJob: testutil.NewTFJobV2(2, 0, 0, 0, 0), - expected: false, - expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - tfv1.TFReplicaTypeWorker: { - Replicas: newInt32(2), - Template: testutil.NewTFReplicaSpecTemplate(), - }, - }, - }, - } - for _, tt := range tests { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - - // only related to worker status - initializeReplicaStatuses(&tt.tfJob.Status, tfv1.TFReplicaTypeWorker) - // set status and add pod to indexer - setStatusForTest(tt.tfJob, tfv1.TFReplicaTypeWorker, tt.workers[0], tt.workers[1], tt.workers[2], false, true, podIndexer, t) - - got, err := ctr.IsWorker0Completed(tt.tfJob, tt.replicas) - if (err != nil) != tt.expectedErr { - t.Errorf("IsWorker0Completed() error = %v, wantErr %v", err, tt.expectedErr) - return - } - if got != tt.expected { - t.Errorf("IsWorker0Completed() got = %v, want %v", got, tt.expected) - } - } -} diff --git a/pkg/controller.v1/tensorflow/status.go 
b/pkg/controller.v1/tensorflow/status.go deleted file mode 100644 index 4d078f7679..0000000000 --- a/pkg/controller.v1/tensorflow/status.go +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TFJob resource. -package tensorflow - -import ( - "context" - "fmt" - "time" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - commonutil "github.com/kubeflow/common/pkg/util" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" -) - -const ( - // tfJobCreatedReason is added in a tfjob when it is created. - tfJobCreatedReason = "TFJobCreated" - // tfJobSucceededReason is added in a tfjob when it is succeeded. - tfJobSucceededReason = "TFJobSucceeded" - // tfJobRunningReason is added in a tfjob when it is running. - tfJobRunningReason = "TFJobRunning" - // tfJobFailedReason is added in a tfjob when it is failed. - tfJobFailedReason = "TFJobFailed" - // tfJobRestarting is added in a tfjob when it is restarting. 
- tfJobRestartingReason = "TFJobRestarting" -) - -var ( - tfJobsSuccessCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "tf_operator_jobs_successful_total", - Help: "Counts number of TF jobs successful", - }, - []string{"job_namespace"}, - ) - tfJobsFailureCount = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "tf_operator_jobs_failed_total", - Help: "Counts number of TF jobs failed", - }, - []string{"job_namespace"}, - ) -) - -func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error { - tfJob, ok := job.(*tfv1.TFJob) - if !ok { - return fmt.Errorf("%v is not a type of TFJob", tfJob) - } - - tfJobKey, err := KeyFunc(tfJob) - if err != nil { - utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfJob, err)) - return err - } - - logger := commonutil.LoggerForJob(tfJob) - - worker0Completed, err := tc.IsWorker0Completed(tfJob, replicas) - if err != nil { - logger.Warnf("check if worker 0 completed error %v", err) - return err - } - - // Set StartTime. - if jobStatus.StartTime == nil { - now := metav1.Now() - jobStatus.StartTime = &now - // enqueue a sync to check if job past ActiveDeadlineSeconds - if tfJob.Spec.RunPolicy.ActiveDeadlineSeconds != nil { - logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *tfJob.Spec.RunPolicy.ActiveDeadlineSeconds) - tc.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second) - } - } - // iterate the replica spec based on this order - allTypes := []commonv1.ReplicaType{ - tfv1.TFReplicaTypeChief, - tfv1.TFReplicaTypeEval, - tfv1.TFReplicaTypeMaster, - tfv1.TFReplicaTypePS, - tfv1.TFReplicaTypeWorker, - } - for _, rtype := range allTypes { - if replicas[rtype] == nil { - continue - } - spec := replicas[rtype] - status := jobStatus.ReplicaStatuses[rtype] - - // Expect to have `replicas - succeeded` pods alive. 
- succeeded := status.Succeeded - expected := *(spec.Replicas) - succeeded - running := status.Active - failed := status.Failed - - logger.Infof("TFJob=%s/%s, ReplicaType=%s expected=%d, running=%d, failed=%d", - tfJob.Namespace, tfJob.Name, rtype, expected, running, failed) - - // If the TFJob contains Chief or Master spec, then we will update the status - // according to the Chief/Master spec. - if ContainChieforMasterSpec(tfJob.Spec.TFReplicaSpecs) { - if tfv1.IsChieforMaster(rtype) { - if running > 0 { - msg := fmt.Sprintf("TFJob %s/%s is running.", - tfJob.Namespace, tfJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobRunning, tfJobRunningReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof( - "Append tfjob condition error: %v", err) - return err - } - } - if expected == 0 { - msg := fmt.Sprintf("TFJob %s/%s successfully completed.", - tfJob.Namespace, tfJob.Name) - tc.Recorder.Event(tfJob, corev1.EventTypeNormal, tfJobSucceededReason, msg) - if jobStatus.CompletionTime == nil { - now := metav1.Now() - jobStatus.CompletionTime = &now - } - err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, tfJobSucceededReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) - return err - } - tfJobsSuccessCount.WithLabelValues(tfJob.Namespace).Inc() - } - } - } else { - if rtype == tfv1.TFReplicaTypeWorker { - // Leave a succeeded condition for the following two cases: - // 1. If default success policy is used and worker 0 has completed. - // 2. If `SuccessPolicyAllWorkers` success policy is used and all workers are succeeded. 
- if expected == 0 || (worker0Completed && *tfJob.Spec.SuccessPolicy != tfv1.SuccessPolicyAllWorkers) { - msg := fmt.Sprintf("TFJob %s/%s successfully completed.", - tfJob.Namespace, tfJob.Name) - tc.Recorder.Event(tfJob, corev1.EventTypeNormal, tfJobSucceededReason, msg) - if jobStatus.CompletionTime == nil { - now := metav1.Now() - jobStatus.CompletionTime = &now - } - err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, tfJobSucceededReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) - return err - } - tfJobsSuccessCount.WithLabelValues(tfJob.Namespace).Inc() - } else if running > 0 { - // Some workers are still running, leave a running condition. - msg := fmt.Sprintf("TFJob %s/%s is running.", - tfJob.Namespace, tfJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, tfJobRunningReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) - return err - } - } - } - } - - if failed > 0 { - restart := false - for _, condition := range jobStatus.Conditions { - if condition.Type == commonv1.JobRestarting { - restart = true - } - } - - if restart { - // job is restarting, no need to set it failed - // we know it because we update the status condition when reconciling the replicas - tfJobsFailureCount.WithLabelValues(tfJob.Namespace).Inc() - } else { - msg := fmt.Sprintf("TFJob %s/%s has failed because %d %s replica(s) failed.", - tfJob.Namespace, tfJob.Name, failed, rtype) - tc.Recorder.Event(tfJob, corev1.EventTypeNormal, tfJobFailedReason, msg) - if jobStatus.CompletionTime == nil { - now := metav1.Now() - jobStatus.CompletionTime = &now - } - err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobFailed, tfJobFailedReason, msg) - if err != nil { - commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) - return err - } - tfJobsFailureCount.WithLabelValues(tfJob.Namespace).Inc() 
- } - } - } - // we assign the jobStatus to the tfJob.Status for testing purpose - // it won't effect the main reconcile logic - // because we already use oldStatus := jobStatus.DeepCopy() to record the oldStatus - // and use !reflect.DeepEqual(*oldStatus, jobStatus) to decide whether to update the tfJob or not - tfJob.Status = *jobStatus.DeepCopy() - - return nil -} - -// UpdateJobStatusInApiServer updates the status of the given TFJob. -func (tc *TFController) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { - tfJob, ok := job.(*tfv1.TFJob) - if !ok { - return fmt.Errorf("%v is not a type of TFJob", tfJob) - } - - startTime := time.Now() - logger := commonutil.LoggerForJob(tfJob) - defer func() { - logger.Infof("Finished updating TFJobs Status %q (%v)", - tfJob.Name, time.Since(startTime)) - }() - - tfJob = tfJob.DeepCopy() - tfJob.Status = *jobStatus.DeepCopy() - - _, err := tc.tfJobClientSet.KubeflowV1().TFJobs(tfJob.Namespace).UpdateStatus(context.TODO(), tfJob, metav1.UpdateOptions{}) - return err -} - -// initializeReplicaStatuses initializes the ReplicaStatuses for replica. -func initializeReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType) { - if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = make(map[commonv1.ReplicaType]*commonv1.ReplicaStatus) - } - - jobStatus.ReplicaStatuses[rtype] = &commonv1.ReplicaStatus{} -} - -// updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. 
-func updateJobReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType, pod *corev1.Pod) { - switch pod.Status.Phase { - case corev1.PodRunning: - jobStatus.ReplicaStatuses[rtype].Active++ - case corev1.PodSucceeded: - jobStatus.ReplicaStatuses[rtype].Succeeded++ - case corev1.PodFailed: - jobStatus.ReplicaStatuses[rtype].Failed++ - } -} - -func isSucceeded(status commonv1.JobStatus) bool { - return hasCondition(status, commonv1.JobSucceeded) -} - -func isFailed(status commonv1.JobStatus) bool { - return hasCondition(status, commonv1.JobFailed) -} - -func hasCondition(status commonv1.JobStatus, condType commonv1.JobConditionType) bool { - for _, condition := range status.Conditions { - if condition.Type == condType && condition.Status == v1.ConditionTrue { - return true - } - } - return false -} diff --git a/pkg/controller.v1/tensorflow/status_test.go b/pkg/controller.v1/tensorflow/status_test.go deleted file mode 100644 index 29d2f65bce..0000000000 --- a/pkg/controller.v1/tensorflow/status_test.go +++ /dev/null @@ -1,592 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package controller provides a Kubernetes controller for a TFJob resource. 
-package tensorflow - -import ( - "fmt" - "testing" - - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - kubeclientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/record" - - batchv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - "github.com/kubeflow/common/pkg/controller.v1/control" - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfjobclientset "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/testutil" -) - -func TestFailed(t *testing.T) { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. 
- volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, _, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - - tfJob := testutil.NewTFJob(3, 0) - initializeReplicaStatuses(&tfJob.Status, tfv1.TFReplicaTypeWorker) - pod := testutil.NewBasePod("pod", tfJob) - pod.Status.Phase = v1.PodFailed - - updateJobReplicaStatuses(&tfJob.Status, tfv1.TFReplicaTypeWorker, pod) - if tfJob.Status.ReplicaStatuses[commonv1.ReplicaType(tfv1.TFReplicaTypeWorker)].Failed != 1 { - t.Errorf("Failed to set the failed to 1") - } - - err := ctr.UpdateJobStatus(tfJob, tfJob.Spec.TFReplicaSpecs, &tfJob.Status) - if err != nil { - t.Errorf("Expected error %v to be nil", err) - } - found := false - for _, condition := range tfJob.Status.Conditions { - if condition.Type == commonv1.JobFailed { - found = true - } - } - if !found { - t.Errorf("Failed condition is not found") - } -} - -func TestStatus(t *testing.T) { - type testCase struct { - description string - tfJob *tfv1.TFJob - - expectedFailedPS int32 - expectedSucceededPS int32 - expectedActivePS int32 - - expectedFailedWorker int32 - expectedSucceededWorker int32 - expectedActiveWorker int32 - - expectedFailedChief int32 - expectedSucceededChief int32 - expectedActiveChief int32 - - restart bool - worker0Completed bool - - expectedType commonv1.JobConditionType - } - - testCases := []testCase{ - testCase{ - description: "Chief worker is succeeded", - tfJob: testutil.NewTFJobWithChief(1, 0), - expectedFailedPS: 0, - 
expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 1, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 1, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobSucceeded, - }, - testCase{ - description: "Chief worker is running", - tfJob: testutil.NewTFJobWithChief(1, 0), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 1, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "Chief worker is failed", - tfJob: testutil.NewTFJobWithChief(1, 0), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 1, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "(No chief worker) Worker is failed", - tfJob: testutil.NewTFJob(1, 0), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 1, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "(No chief worker) Worker is succeeded", - tfJob: testutil.NewTFJob(1, 0), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 1, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobSucceeded, - }, - 
testCase{ - description: "(No chief worker) Worker is running", - tfJob: testutil.NewTFJob(1, 0), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 0, - expectedActiveWorker: 1, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "(No chief worker) 2 workers are succeeded, 2 workers are active", - tfJob: testutil.NewTFJob(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 0, - expectedSucceededWorker: 2, - expectedActiveWorker: 2, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "(No chief worker) 2 workers are running, 2 workers are failed", - tfJob: testutil.NewTFJob(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 2, - expectedSucceededWorker: 0, - expectedActiveWorker: 2, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "(No chief worker) 2 workers are succeeded, 2 workers are failed", - tfJob: testutil.NewTFJob(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 2, - expectedSucceededWorker: 2, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "(No chief worker) worker-0 are succeeded, 3 workers are active", - tfJob: testutil.NewTFJob(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 0, 
- expectedSucceededWorker: 1, - expectedActiveWorker: 3, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: true, - expectedType: commonv1.JobSucceeded, - }, - testCase{ - description: "(No chief worker, successPolicy: AllWorkers) worker-0 are succeeded, 3 workers are active", - tfJob: testutil.NewTFJobWithSuccessPolicy(4, 0, tfv1.SuccessPolicyAllWorkers), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 1, - expectedActiveWorker: 3, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: true, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "(No chief worker, successPolicy: AllWorkers) 4 workers are succeeded", - tfJob: testutil.NewTFJobWithSuccessPolicy(4, 0, tfv1.SuccessPolicyAllWorkers), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 0, - expectedSucceededWorker: 4, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: true, - expectedType: commonv1.JobSucceeded, - }, - testCase{ - description: "(No chief worker, successPolicy: AllWorkers) worker-0 is succeeded, 2 workers are running, 1 worker is failed", - tfJob: testutil.NewTFJobWithSuccessPolicy(4, 0, tfv1.SuccessPolicyAllWorkers), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 0, - expectedFailedWorker: 1, - expectedSucceededWorker: 1, - expectedActiveWorker: 2, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: true, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "Chief is running, workers are failed", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - 
expectedFailedWorker: 4, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 1, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "Chief is running, workers are succeeded", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 0, - expectedSucceededWorker: 4, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 1, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobRunning, - }, - testCase{ - description: "Chief is running, a PS is failed", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 1, - expectedSucceededPS: 0, - expectedActivePS: 1, - expectedFailedWorker: 0, - expectedSucceededWorker: 4, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 0, - expectedActiveChief: 1, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "Chief is failed, workers are succeeded", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 0, - expectedSucceededWorker: 4, - expectedActiveWorker: 0, - expectedFailedChief: 1, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobFailed, - }, - testCase{ - description: "Chief is succeeded, workers are failed", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 4, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 0, - expectedSucceededChief: 1, - expectedActiveChief: 0, - restart: false, - worker0Completed: false, - expectedType: commonv1.JobSucceeded, - }, - 
testCase{ - description: "Chief is failed and restarting", - tfJob: testutil.NewTFJobWithChief(4, 2), - expectedFailedPS: 0, - expectedSucceededPS: 0, - expectedActivePS: 2, - expectedFailedWorker: 4, - expectedSucceededWorker: 0, - expectedActiveWorker: 0, - expectedFailedChief: 1, - expectedSucceededChief: 0, - expectedActiveChief: 0, - restart: true, - worker0Completed: false, - expectedType: commonv1.JobRestarting, - }, - } - - for i, c := range testCases { - // Prepare the clientset and controller for the test. - kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &v1.SchemeGroupVersion, - }, - }, - ) - - // Prepare the volcano clientset and controller for the test. - volcanoClientSet := volcanoclient.NewForConfigOrDie(&rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &batchv1beta1.SchemeGroupVersion, - }, - }, - ) - - config := &rest.Config{ - Host: "", - ContentConfig: rest.ContentConfig{ - GroupVersion: &tfv1.GroupVersion, - }, - } - tfJobClientSet := tfjobclientset.NewForConfigOrDie(config) - ctr, kubeInformerFactory, _ := newTFController(config, kubeClientSet, - volcanoClientSet, tfJobClientSet, 0, options.ServerOption{}) - fakePodControl := &control.FakePodControl{} - ctr.PodControl = fakePodControl - ctr.Recorder = &record.FakeRecorder{} - ctr.tfJobInformerSynced = testutil.AlwaysReady - ctr.PodInformerSynced = testutil.AlwaysReady - ctr.ServiceInformerSynced = testutil.AlwaysReady - tfJobIndexer := ctr.tfJobInformer.GetIndexer() - podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() - - stopCh := make(chan struct{}) - run := func(<-chan struct{}) { - if err := ctr.Run(testutil.ThreadCount, stopCh); err != nil { - t.Errorf("Failed to run the controller: %v", err) - } - } - go run(stopCh) - - unstructured, err := testutil.ConvertTFJobToUnstructured(c.tfJob) - if err != nil { - t.Errorf("Failed to convert the TFJob to 
Unstructured: %v", err) - } - - if err := tfJobIndexer.Add(unstructured); err != nil { - t.Errorf("Failed to add tfjob to tfJobIndexer: %v", err) - } - - initializeReplicaStatuses(&c.tfJob.Status, tfv1.TFReplicaTypeWorker) - initializeReplicaStatuses(&c.tfJob.Status, tfv1.TFReplicaTypeChief) - initializeReplicaStatuses(&c.tfJob.Status, tfv1.TFReplicaTypePS) - - setStatusForTest(c.tfJob, tfv1.TFReplicaTypePS, c.expectedFailedPS, c.expectedSucceededPS, c.expectedActivePS, c.restart, c.worker0Completed, podIndexer, t) - setStatusForTest(c.tfJob, tfv1.TFReplicaTypeWorker, c.expectedFailedWorker, c.expectedSucceededWorker, c.expectedActiveWorker, c.restart, c.worker0Completed, podIndexer, t) - setStatusForTest(c.tfJob, tfv1.TFReplicaTypeChief, c.expectedFailedChief, c.expectedSucceededChief, c.expectedActiveChief, c.restart, c.worker0Completed, podIndexer, t) - - // err = ctr.UpdateJobStatus(c.tfJob, c.tfJob.Spec.TFReplicaSpecs, &c.tfJob.Status) - // if err != nil { - // t.Errorf("%s: Expected error %v to be nil", c.description, err) - // } - _ = ctr.ReconcileJobs(c.tfJob, c.tfJob.Spec.TFReplicaSpecs, c.tfJob.Status, &c.tfJob.Spec.RunPolicy) - - // Test filterOutCondition - filterOutConditionTest(c.tfJob.Status, t) - - found := false - for _, condition := range c.tfJob.Status.Conditions { - if condition.Type == c.expectedType { - found = true - } - } - if !found { - t.Errorf("Case[%d]%s: Condition %s is not found", i, c.description, c.expectedType) - } - } -} - -func setStatusForTest(tfJob *tfv1.TFJob, rtype commonv1.ReplicaType, failed, succeeded, active int32, restart bool, worker0Completed bool, podIndexer cache.Indexer, t *testing.T) { - if restart == true { - tfJob.Spec.TFReplicaSpecs[rtype].RestartPolicy = commonv1.RestartPolicyExitCode - } - - var typ string - switch rtype { - case tfv1.TFReplicaTypeWorker: - typ = testutil.LabelWorker - case tfv1.TFReplicaTypePS: - typ = testutil.LabelPS - case tfv1.TFReplicaTypeChief: - typ = testutil.LabelChief - default: - 
fmt.Println("wrong type") - } - - var i int32 - index := 0 - for i = 0; i < succeeded; i++ { - pod := testutil.NewPod(tfJob, typ, index) - pod.Status.Phase = v1.PodSucceeded - if worker0Completed == true && rtype == tfv1.TFReplicaTypeWorker && index == 0 { - pod.Status.ContainerStatuses = []v1.ContainerStatus{ - { - Name: tfv1.DefaultContainerName, - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ - ExitCode: int32(0), // exit with 0 - }, - }, - }, - } - } - if err := podIndexer.Add(pod); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - updateJobReplicaStatuses(&tfJob.Status, rtype, pod) - - index++ - } - for i = 0; i < failed; i++ { - pod := testutil.NewPod(tfJob, typ, index) - pod.Status.Phase = v1.PodFailed - if restart == true { - if pod.Status.ContainerStatuses == nil { - pod.Status.ContainerStatuses = []v1.ContainerStatus{ - { - Name: tfv1.DefaultContainerName, - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ - ExitCode: int32(130), // 130 is a retryable code - }, - }, - }, - } - } - } - if err := podIndexer.Add(pod); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - updateJobReplicaStatuses(&tfJob.Status, rtype, pod) - index++ - } - for i = 0; i < active; i++ { - pod := testutil.NewPod(tfJob, typ, index) - pod.Status.Phase = v1.PodRunning - if err := podIndexer.Add(pod); err != nil { - t.Errorf("%s: unexpected error when adding pod %v", tfJob.Name, err) - } - updateJobReplicaStatuses(&tfJob.Status, rtype, pod) - index++ - } -} - -func filterOutConditionTest(status commonv1.JobStatus, t *testing.T) { - flag := isFailed(status) || isSucceeded(status) - for _, condition := range status.Conditions { - if flag && condition.Type == commonv1.JobRunning && condition.Status == v1.ConditionTrue { - t.Error("Error condition status when succeeded or failed") - } - } -} diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go 
b/pkg/controller.v1/tensorflow/tfjob_controller.go index 2c8f93da1e..379f8bb407 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -21,55 +21,76 @@ import ( "strings" "time" - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/validation" - - "sigs.k8s.io/controller-runtime/pkg/event" - - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/handler" - "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/controller-runtime/pkg/source" - - train_util "github.com/kubeflow/common/pkg/util/train" - + "github.com/go-logr/logr" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - commonutil "github.com/kubeflow/common/pkg/util" - - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime/schema" - - "github.com/kubeflow/common/pkg/controller.v1/expectation" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - kubeclientset "k8s.io/client-go/kubernetes" - "sigs.k8s.io/controller-runtime/pkg/manager" - volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" - - "k8s.io/apimachinery/pkg/types" - "github.com/kubeflow/common/pkg/controller.v1/common" "github.com/kubeflow/common/pkg/controller.v1/control" + "github.com/kubeflow/common/pkg/controller.v1/expectation" + commonutil "github.com/kubeflow/common/pkg/util" + train_util "github.com/kubeflow/common/pkg/util/train" + tensorflowv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" + tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/validation" + trainingoperatorcommon "github.com/kubeflow/tf-operator/pkg/common" + "github.com/kubeflow/tf-operator/pkg/common/util" + "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "k8s.io/client-go/tools/record" - - "github.com/sirupsen/logrus" - 
"sigs.k8s.io/controller-runtime/pkg/log" - - "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - - tensorflowv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - trainingoperatorcommon "github.com/kubeflow/tf-operator/pkg/common" - "github.com/kubeflow/tf-operator/pkg/common/util" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/source" + volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned" ) -var ( - defaultCleanPodPolicy = commonv1.CleanPodPolicyNone +const ( + // tfJobSucceededReason is added in a tfjob when it is succeeded. + tfJobSucceededReason = "TFJobSucceeded" + // tfJobRunningReason is added in a tfjob when it is running. + tfJobRunningReason = "TFJobRunning" + // tfJobFailedReason is added in a tfjob when it is failed. + tfJobFailedReason = "TFJobFailed" + // tfJobRestarting is added in a tfjob when it is restarting. + tfJobRestartingReason = "TFJobRestarting" + + FailedDeleteJobReason = "FailedDeleteJob" + SuccessfulDeleteJobReason = "SuccessfulDeleteJob" + + controllerName = "tfjob-controller" + + // labels for pods and servers. + tfReplicaTypeLabel = "replica-type" + tfReplicaIndexLabel = "replica-index" + // volcanoTaskSpecKey task spec key used in pod annotation when EnableGangScheduling is true + volcanoTaskSpecKey = "volcano.sh/task-spec" + + // gang scheduler name. 
+ gangSchedulerName = "volcano" + // tfConfig is the environment variable name of TensorFlow cluster spec. + tfConfig = "TF_CONFIG" + // exitedWithCodeReason is the normal reason when the pod is exited because of the exit code. + exitedWithCodeReason = "ExitedWithCode" + // podTemplateRestartPolicyReason is the warning reason when the restart + // policy is set in pod template. + podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy" + // podTemplateSchedulerNameReason is the warning reason when other scheduler name is set + // in pod templates with gang-scheduling enabled + podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName" + // gangSchedulingPodGroupAnnotation is the annotation key used by batch schedulers + gangSchedulingPodGroupAnnotation = "scheduling.k8s.io/group-name" ) func NewReconciler(mgr manager.Manager) *TFJobReconciler { @@ -356,7 +377,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 return fmt.Errorf("%v is not a type of TFJob", tfJob) } - tfJobKey, err := KeyFunc(tfJob) + tfJobKey, err := common.KeyFunc(tfJob) if err != nil { utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfJob, err)) return err @@ -407,7 +428,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 // If the TFJob contains Chief or Master spec, then we will update the status // according to the Chief/Master spec. - if ContainChieforMasterSpec(tfJob.Spec.TFReplicaSpecs) { + if ContainsChiefOrMasterSpec(tfJob.Spec.TFReplicaSpecs) { if tensorflowv1.IsChieforMaster(rtype) { if running > 0 { msg := fmt.Sprintf("TFJob %s/%s is running.", @@ -574,28 +595,24 @@ func (r *TFJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.Po return nil } -// Same as (tc *TFController) GetDefaultContainerName(..) 
in controller.go func (r *TFJobReconciler) GetDefaultContainerName() string { return tensorflowv1.DefaultContainerName } -// Same as (tc *TFController) GetDefaultContainerPortName(..) in controller.go func (r *TFJobReconciler) GetDefaultContainerPortName() string { return tensorflowv1.DefaultPortName } -// Same as (tc *TFController) IsMasterRole(..) in controller.go func (r *TFJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool { - if ContainChieforMasterSpec(replicas) { + if ContainsChiefOrMasterSpec(replicas) { return rtype == tensorflowv1.TFReplicaTypeChief || rtype == tensorflowv1.TFReplicaTypeMaster } // else check if it is worker with index 0 return rtype == tensorflowv1.TFReplicaTypeWorker && index == 0 } -// Following are replicatef from TFController -// IsWorker0Completed return true if pod of worker0 succeeded and exited with 0 +// IsWorker0Completed returns true if pod of worker0 succeeded and exited with 0 func (r *TFJobReconciler) IsWorker0Completed(tfjob *tensorflowv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error) { worker0Completed := false _, ok := replicas[tensorflowv1.TFReplicaTypeWorker] @@ -746,7 +763,7 @@ func (r *TFJobReconciler) ReconcilePods( func (r *TFJobReconciler) createNewPod(tfjob *tfv1.TFJob, rt, index string, spec *commonv1.ReplicaSpec, masterRole bool, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { - tfjobKey, err := KeyFunc(tfjob) + tfjobKey, err := common.KeyFunc(tfjob) if err != nil { utilruntime.HandleError(fmt.Errorf("couldn't get key for tfjob object %#v: %v", tfjob, err)) return err diff --git a/pkg/controller.v1/tensorflow/util.go b/pkg/controller.v1/tensorflow/util.go index c7e86c1049..93aa781544 100644 --- a/pkg/controller.v1/tensorflow/util.go +++ b/pkg/controller.v1/tensorflow/util.go @@ -15,14 +15,9 @@ package tensorflow import ( - "fmt" - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" 
tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" -) - -var ( - errPortNotFound = fmt.Errorf("failed to found the port") + corev1 "k8s.io/api/core/v1" ) // GetPortFromTFJob gets the port of tensorflow container. @@ -41,8 +36,8 @@ func GetPortFromTFJob(tfJob *tfv1.TFJob, rtype commonv1.ReplicaType) (int32, err return tfv1.DefaultPort, nil } -// ContainChieforMasterSpec returns true if the tfjob contains chief or master spec. -func ContainChieforMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { +// ContainsChiefOrMasterSpec returns true if the tfjob contains chief or master spec. +func ContainsChiefOrMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { if _, ok := replicas[tfv1.TFReplicaTypeChief]; ok { return true } else if _, ok := replicas[tfv1.TFReplicaTypeMaster]; ok { @@ -50,3 +45,74 @@ func ContainChieforMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.Replic } return false } + +// originally from pkg/controller.v1/tensorflow/pod.go (deleted) +func getContainerExitCode(pod *corev1.Pod) int32 { + var exitCode int32 = 0xbeef // magic number + for _, status := range pod.Status.ContainerStatuses { + state := status.State + if status.Name == tfv1.DefaultContainerName && state.Terminated != nil { + exitCode = state.Terminated.ExitCode + } + } + return exitCode +} + +// originally from pkg/controller.v1/tensorflow/pod.go (deleted) +func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *commonv1.ReplicaSpec) { + // This is necessary since restartPolicyExitCode is not supported in v1.PodTemplateSpec + if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever + } else { + podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy) + } +} + +// isDistributed returns if the TFJob is a distributed training job. +// Ref https://github.com/kubeflow/tf-operator/issues/1078. 
+// originally from pkg/controller.v1/tensorflow/pod.go (deleted) +func isDistributed(tfjob *tfv1.TFJob) bool { + replicas := tfjob.Spec.TFReplicaSpecs + distributionCount := 0 + allTypes := []commonv1.ReplicaType{ + tfv1.TFReplicaTypeChief, + tfv1.TFReplicaTypeEval, + tfv1.TFReplicaTypeMaster, + tfv1.TFReplicaTypePS, + tfv1.TFReplicaTypeWorker, + } + // Check if there is only one replica. + for _, typ := range allTypes { + if replicas[typ] != nil { + if replicas[typ].Replicas == nil { + distributionCount++ + } else { + distributionCount += int(*replicas[typ].Replicas) + } + } + } + return distributionCount != 1 +} + +// initializeReplicaStatuses initializes the ReplicaStatuses for replica. +// originally from pkg/controller.v1/tensorflow/status.go (deleted) +func initializeReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType) { + if jobStatus.ReplicaStatuses == nil { + jobStatus.ReplicaStatuses = make(map[commonv1.ReplicaType]*commonv1.ReplicaStatus) + } + + jobStatus.ReplicaStatuses[rtype] = &commonv1.ReplicaStatus{} +} + +// updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. +// originally from pkg/controller.v1/tensorflow/status.go (deleted) +func updateJobReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType, pod *corev1.Pod) { + switch pod.Status.Phase { + case corev1.PodRunning: + jobStatus.ReplicaStatuses[rtype].Active++ + case corev1.PodSucceeded: + jobStatus.ReplicaStatuses[rtype].Succeeded++ + case corev1.PodFailed: + jobStatus.ReplicaStatuses[rtype].Failed++ + } +} diff --git a/pkg/controller.v1/tensorflow/util_test.go b/pkg/controller.v1/tensorflow/util_test.go deleted file mode 100644 index 938ae7475b..0000000000 --- a/pkg/controller.v1/tensorflow/util_test.go +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package tensorflow - -import ( - "testing" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - - tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" - "github.com/kubeflow/tf-operator/pkg/common/util/v1/testutil" -) - -func TestGenOwnerReference(t *testing.T) { - testName := "test-tfjob" - testUID := types.UID("test-UID") - tfJob := &tfv1.TFJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: testName, - UID: testUID, - }, - } - - ref := testutil.GenOwnerReference(tfJob) - if ref.UID != testUID { - t.Errorf("Expected UID %s, got %s", testUID, ref.UID) - } - if ref.Name != testName { - t.Errorf("Expected Name %s, got %s", testName, ref.Name) - } - if ref.APIVersion != tfv1.GroupVersion.Version { - t.Errorf("Expected APIVersion %s, got %s", tfv1.GroupVersion.String(), ref.APIVersion) - } -} - -func TestGenLabels(t *testing.T) { - testKey := "test/key" - expctedKey := "test-key" - - labels := testutil.GenLabels(testKey) - jobNamelabel := testutil.JobNameLabel - - if labels[jobNamelabel] != expctedKey { - t.Errorf("Expected %s %s, got %s", jobNamelabel, expctedKey, jobNamelabel) - } - if labels[labelGroupName] != tfv1.GroupVersion.Group { - t.Errorf("Expected %s %s, got %s", labelGroupName, tfv1.GroupVersion.Group, labels[labelGroupName]) - } -} - -func TestConvertTFJobToUnstructured(t *testing.T) { - testName := "test-tfjob" - testUID := types.UID("test-UID") - tfJob := &tfv1.TFJob{ - TypeMeta: metav1.TypeMeta{ - Kind: tfv1.Kind, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: testName, - UID: testUID, - }, - } - 
- _, err := testutil.ConvertTFJobToUnstructured(tfJob) - if err != nil { - t.Errorf("Expected error to be nil while got %v", err) - } -} diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index 97f60bf97e..d170bfa3f5 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -61,7 +61,7 @@ import ( ) const ( - controllerName = "xgboostjob-operator" + controllerName = "xgboostjob-controller" // Reasons for job events. FailedDeleteJobReason = "FailedDeleteJob" diff --git a/py/kubeflow/tf_operator/release.py b/py/kubeflow/tf_operator/release.py index df3b40bfbc..916349ebcf 100755 --- a/py/kubeflow/tf_operator/release.py +++ b/py/kubeflow/tf_operator/release.py @@ -145,11 +145,11 @@ def build_operator_image(root_dir, commit = build_and_push_image.GetGitHash(root_dir) targets = [ - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1", + "github.com/kubeflow/tf-operator/cmd/training-operator.v1", ] for t in targets: if t in [ - "github.com/kubeflow/tf-operator/cmd/tf-operator.v1" + "github.com/kubeflow/tf-operator/cmd/training-operator.v1" ]: util.run([ "go", "install", "-ldflags", @@ -169,7 +169,7 @@ def build_operator_image(root_dir, # List of paths to copy relative to root. sources = [ "build/images/tf_operator/Dockerfile", "examples/tf_sample/tf_smoke.py", - os.path.join(go_path, bin_path, "tf-operator.v1"), + os.path.join(go_path, bin_path, "training-operator.v1"), "cmd", "pkg", "third_party", "vendor", "go.mod", "go.sum" ]