From 4e374c3e90792ec08523545d934775bc28873481 Mon Sep 17 00:00:00 2001 From: Bradley Laney Date: Mon, 28 Oct 2024 21:17:06 -0400 Subject: [PATCH] feat: determined_master_host and friends helm support, better defaults (#10138) When `determined_master_ip` is unsettable via Helm and defaults to the service IP, life with proxies is hard. This change renames `determined_master_ip` to `determined_master_host` everywhere with some backwards compatibility, defaults `determined_master_host` to `<service-name>.<namespace>.svc.cluster.local`, and makes all of this overridable in Helm. --- .../devcluster/multi-k8s.devcluster.yaml | 4 +- .../devcluster/single-k8s.devcluster.yaml | 2 +- .../add-host-port-scheme-to-helm.rst | 9 ++ .../k8s/setup-multiple-resource-managers.rst | 2 +- .../determined/templates/master-config.yaml | 15 +++ helm/charts/determined/values.yaml | 12 +- master/cmd/determined-master/root.go | 103 +++++++++++++----- master/cmd/determined-master/root_test.go | 83 ++++++++++++++ .../config/resource_manager_config.go | 26 ++--- master/internal/rm/kubernetesrm/job.go | 6 +- master/internal/rm/kubernetesrm/jobs.go | 22 ++-- .../kubernetes_resource_manager.go | 2 +- master/internal/rm/kubernetesrm/spec.go | 6 +- master/internal/rm/kubernetesrm/spec_test.go | 2 +- tools/k8s/devcluster.yaml | 2 +- tools/k8s/multicluster.yaml | 4 +- tools/k8s/remote_connect.py | 2 +- 17 files changed, 228 insertions(+), 74 deletions(-) create mode 100644 docs/release-notes/add-host-port-scheme-to-helm.rst diff --git a/.circleci/devcluster/multi-k8s.devcluster.yaml b/.circleci/devcluster/multi-k8s.devcluster.yaml index 3f743134c87..8651f126642 100644 --- a/.circleci/devcluster/multi-k8s.devcluster.yaml +++ b/.circleci/devcluster/multi-k8s.devcluster.yaml @@ -39,7 +39,7 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: /tmp/defaultrm-kubeconf - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 internal_task_gateway: gateway_name: contour @@ 
-60,7 +60,7 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: /tmp/additionalrm-kubeconf - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 resource_pools: - pool_name: additional_pool diff --git a/.circleci/devcluster/single-k8s.devcluster.yaml b/.circleci/devcluster/single-k8s.devcluster.yaml index 646f9ff6a14..04dc8497dcd 100644 --- a/.circleci/devcluster/single-k8s.devcluster.yaml +++ b/.circleci/devcluster/single-k8s.devcluster.yaml @@ -33,5 +33,5 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: ~/.kube/config - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 diff --git a/docs/release-notes/add-host-port-scheme-to-helm.rst b/docs/release-notes/add-host-port-scheme-to-helm.rst new file mode 100644 index 00000000000..d0f49a72c86 --- /dev/null +++ b/docs/release-notes/add-host-port-scheme-to-helm.rst @@ -0,0 +1,9 @@ +:orphan: + +**New Features** + +- Helm: Support configuring ``determined_master_host``, ``determined_master_port``, and + ``determined_master_scheme``. These control how tasks address the Determined API server and are + useful when installations span multiple Kubernetes clusters or there are proxies in between tasks + and the master. Also, ``determined_master_host`` now defaults to the service host, + ``<service-name>.<namespace>.svc.cluster.local``, instead of the service IP. diff --git a/docs/setup-cluster/k8s/setup-multiple-resource-managers.rst b/docs/setup-cluster/k8s/setup-multiple-resource-managers.rst index ff0869ad9fa..b7a12d175d0 100644 --- a/docs/setup-cluster/k8s/setup-multiple-resource-managers.rst +++ b/docs/setup-cluster/k8s/setup-multiple-resource-managers.rst @@ -156,7 +156,7 @@ the same as the “cluster name” for a given cluster. If an additional resource manager needs to connect to the Determined master through a gateway requiring TLS, ``resource_manager.determined_master_scheme`` should be set to ``https``. 
If - ``resource_manager.determined_master_scheme`` is not set ``determined_master_ip`` will assume + ``resource_manager.determined_master_scheme`` is not set ``determined_master_host`` will assume ``https`` if the master is terminating TLS and ``http`` otherwise. ******* diff --git a/helm/charts/determined/templates/master-config.yaml b/helm/charts/determined/templates/master-config.yaml index e935cacec92..051473c0054 100644 --- a/helm/charts/determined/templates/master-config.yaml +++ b/helm/charts/determined/templates/master-config.yaml @@ -224,6 +224,21 @@ stringData: fluent: {{- toYaml .Values.fluent | nindent 8}} {{- end }} + {{- if .Values.determinedMasterHost }} + {{- if .Values.determinedMasterScheme }} + determined_master_scheme: {{ .Values.determinedMasterScheme | quote }} + {{- else if .Values.tlsSecret }} + determined_master_scheme: "https" + {{- else }} + determined_master_scheme: "http" + {{- end }} + determined_master_host: {{ .Values.determinedMasterHost | quote }} + {{- if .Values.determinedMasterPort }} + determined_master_port: {{ .Values.determinedMasterPort }} + {{- else }} + determined_master_port: {{ .Values.masterPort }} + {{- end }} + {{- end }} default_aux_resource_pool: {{.Values.defaultAuxResourcePool}} default_compute_resource_pool: {{.Values.defaultComputeResourcePool}} diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 58ad92cc512..1104896c9bc 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -413,6 +413,16 @@ resourcePools: ## Configure the initial user password for the cluster # initialUserPassword: +# determinedMasterHost configures the hostname that tasks launched by the primary resource manager use when +# communicating with our API server. This is useful when installations span multiple Kubernetes clusters and when there +# are proxies in between tasks and the master. It defaults to `<service-name>.<namespace>.svc.cluster.local`. 
+# determinedMasterHost: +# determinedMasterPort configures the port for the host above. It defaults to `masterPort` specified elsewhere. Must +# be used with determinedMasterHost or it is ineffective. +# determinedMasterPort: +# determinedMasterScheme configures the scheme for the host and port above. It defaults to `https` if our API server +# is deployed with TLS, else `http`. Must be used with determinedMasterHost or it is ineffective. +# determinedMasterScheme: # additional_resource_managers: # - resource_manager: # type: kubernetes @@ -421,7 +431,7 @@ resourcePools: # default_namespace: default # kubeconfig_secret_name: additionalrm # kubeconfig_secret_value: config -# determined_master_ip: 10.11.12.13 +# determined_master_host: 10.11.12.13 # determined_master_port: 8080 # resource_pools: # - pool_name: additional_pool diff --git a/master/cmd/determined-master/root.go b/master/cmd/determined-master/root.go index fdca1f180f2..567c428928a 100644 --- a/master/cmd/determined-master/root.go +++ b/master/cmd/determined-master/root.go @@ -185,23 +185,20 @@ func getConfig(configMap map[string]interface{}) (*config.Config, error) { } func applyBackwardsCompatibility(configMap map[string]interface{}) (map[string]interface{}, error) { - // Preemption timeout moved from __internal to task_container_defaults - if internalMap, ok := configMap["__internal"].(map[string]interface{}); ok { - if oldPreemptTimeout, ok := internalMap["preemption_timeout"]; ok && oldPreemptTimeout != nil { - if preemptionDuration, err := time.ParseDuration(oldPreemptTimeout.(string)); err == nil { - preemptionTimeoutSeconds := int(preemptionDuration.Seconds()) - if taskContainerMap, ok := configMap["task_container_defaults"].(map[string]interface{}); ok { - // Only set task_container_defaults from __internal if nil - if taskContainerTimeout, ok := taskContainerMap["preemption_timeout"]; !ok || taskContainerTimeout == nil { - taskContainerMap["preemption_timeout"] = preemptionTimeoutSeconds - 
configMap["task_container_defaults"] = taskContainerMap - } - } - } - } - delete(internalMap, "preemption_timeout") + err := shimOldRMConfig(configMap) + if err != nil { + return nil, err } + shimPreemptionTimeout(configMap) + shimDeterminedMasterIP(configMap) + return configMap, nil +} + +// Ensure we use either the old schema or the new one. +// Use configMap if RMs are not defined at all, or if they are defined using the new schema. +// If use the old schema, convert it to the new one. +func shimOldRMConfig(configMap map[string]interface{}) error { const ( defaultVal = "default" agentVal = "agent" @@ -213,20 +210,17 @@ func applyBackwardsCompatibility(configMap map[string]interface{}) (map[string]i vScheduler, schedulerExisted := configMap["scheduler"] vProvisioner, provisionerExisted := configMap["provisioner"] - // Ensure we use either the old schema or the new one. oldRMConfig := schedulerExisted || provisionerExisted newRMConfig := rmExisted || rpsExisted if newRMConfig && oldRMConfig { - return nil, errors.New( + return errors.New( "cannot use the old and the new configuration schema at the same time", ) } if !oldRMConfig { - // Use configMap if RMs are not defined at all, or if they are defined using the new schema. - return configMap, nil + return nil } - // If use the old schema, convert it to the new one. 
newScheduler := map[string]interface{}{ "type": "priority", "fitting_policy": "best", @@ -237,38 +231,38 @@ func applyBackwardsCompatibility(configMap map[string]interface{}) (map[string]i if schedulerExisted { schedulerMap, ok := vScheduler.(map[string]interface{}) if !ok { - return nil, errors.New("wrong type for scheduler field") + return errors.New("wrong type for scheduler field") } if vFit, ok := schedulerMap["fit"]; ok { newScheduler["fitting_policy"], ok = vFit.(string) if !ok { - return nil, errors.New("wrong type for scheduler.fit field") + return errors.New("wrong type for scheduler.fit field") } } if vType, ok := schedulerMap["type"]; ok { newScheduler["type"], ok = vType.(string) if !ok { - return nil, errors.New("wrong type for scheduler.type field") + return errors.New("wrong type for scheduler.type field") } } if vRP, ok := schedulerMap["resource_provider"]; ok { rpMap, ok := vRP.(map[string]interface{}) if !ok { - return nil, errors.New("wrong type for scheduler.resource_provider field") + return errors.New("wrong type for scheduler.resource_provider field") } vRPType, ok := rpMap["type"] if ok { switch vRPTypeStr, ok := vRPType.(string); { case !ok: - return nil, errors.New("wrong type for scheduler.resource_provider.type field") + return errors.New("wrong type for scheduler.resource_provider.type field") case vRPTypeStr == defaultVal: newRM["type"] = agentVal case vRPTypeStr == kubernetesVal: newRM["type"] = kubernetesVal default: - return nil, errors.New("wrong value for scheduler.resource_provider.type field") + return errors.New("wrong value for scheduler.resource_provider.type field") } } else { newRM["type"] = agentVal @@ -298,18 +292,18 @@ func applyBackwardsCompatibility(configMap map[string]interface{}) (map[string]i if provisionerExisted { provisionerMap, ok := vProvisioner.(map[string]interface{}) if !ok { - return nil, errors.New("wrong type for provisioner field") + return errors.New("wrong type for provisioner field") } 
newRP["provider"] = provisionerMap if vProvider, ok := provisionerMap["provider"]; ok { vProviderStr, ok := vProvider.(string) if !ok { - return nil, errors.New("wrong type for provisioner.provider field") + return errors.New("wrong type for provisioner.provider field") } if vProviderStr != "aws" && vProviderStr != "gcp" { - return nil, errors.New("wrong value for provisioner.provider field") + return errors.New("wrong value for provisioner.provider field") } provisionerMap["type"] = provisionerMap["provider"] @@ -321,6 +315,55 @@ func applyBackwardsCompatibility(configMap map[string]interface{}) (map[string]i delete(configMap, "scheduler") delete(configMap, "provisioner") + return nil +} - return configMap, nil +// Preemption timeout moved from __internal to task_container_defaults +// Only set task_container_defaults from __internal if nil. +func shimPreemptionTimeout(configMap map[string]interface{}) { + if internalMap, ok := configMap["__internal"].(map[string]interface{}); ok { + if oldPreemptTimeout, ok := internalMap["preemption_timeout"]; ok && oldPreemptTimeout != nil { + if preemptionDuration, err := time.ParseDuration(oldPreemptTimeout.(string)); err == nil { + preemptionTimeoutSeconds := int(preemptionDuration.Seconds()) + if taskContainerMap, ok := configMap["task_container_defaults"].(map[string]interface{}); ok { + if taskContainerTimeout, ok := taskContainerMap["preemption_timeout"]; !ok || taskContainerTimeout == nil { + taskContainerMap["preemption_timeout"] = preemptionTimeoutSeconds + configMap["task_container_defaults"] = taskContainerMap + } + } + } + } + delete(internalMap, "preemption_timeout") + } +} + +// Rename from `determined_master_ip` to `determined_master_host` in old KubernetesRM configs. 
+func shimDeterminedMasterIP(configMap map[string]interface{}) { + shimDeterminedMasterIPInRM := func(rm map[string]any) { + if rm["type"] != "kubernetes" { + return + } + if ip, ok := rm["determined_master_ip"]; ok { + if _, ok := rm["determined_master_host"]; !ok { + rm["determined_master_host"] = ip + delete(rm, "determined_master_ip") + } else { + log.Warn("ignoring duplicated configuration `determined_master_ip`") + delete(rm, "determined_master_ip") + } + } + } + if rmi, ok := configMap["resource_manager"]; ok { + if rm, ok := rmi.(map[string]any); ok { + shimDeterminedMasterIPInRM(rm) + } + } + if rmis, ok := configMap["additional_resource_managers"]; ok { + rms, ok := rmis.([]map[string]any) + if ok { + for _, rm := range rms { + shimDeterminedMasterIPInRM(rm) + } + } + } } diff --git a/master/cmd/determined-master/root_test.go b/master/cmd/determined-master/root_test.go index 06d98ec34c1..1bd62ff1888 100644 --- a/master/cmd/determined-master/root_test.go +++ b/master/cmd/determined-master/root_test.go @@ -228,6 +228,89 @@ func TestApplyBackwardsCompatibility(t *testing.T) { }, }, }, + { + name: "determined master ip to host rename, ip set", + before: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_ip": "10.0.0.1", + }, + "additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_ip": "10.0.0.2", + }, + }, + }, + expected: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_host": "10.0.0.1", + }, + "additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_host": "10.0.0.2", + }, + }, + }, + }, + { + name: "determined master ip to host rename, both set", + before: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_host": "10.0.0.1", + "determined_master_ip": "10.0.0.1", + }, + 
"additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_host": "10.0.0.2", + "determined_master_ip": "10.0.0.2", + }, + }, + }, + expected: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_host": "10.0.0.1", + }, + "additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_host": "10.0.0.2", + }, + }, + }, + }, + { + name: "determined master ip to host rename, host set get left alone", + before: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_host": "10.0.0.1", + }, + "additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_host": "10.0.0.2", + }, + }, + }, + expected: map[string]interface{}{ + "resource_manager": map[string]interface{}{ + "type": "kubernetes", + "determined_master_host": "10.0.0.1", + }, + "additional_resource_managers": []map[string]any{ + { + "type": "kubernetes", + "determined_master_host": "10.0.0.2", + }, + }, + }, + }, } for ix := range tcs { tc := tcs[ix] diff --git a/master/internal/config/resource_manager_config.go b/master/internal/config/resource_manager_config.go index cd13d711e68..a6455747624 100644 --- a/master/internal/config/resource_manager_config.go +++ b/master/internal/config/resource_manager_config.go @@ -188,7 +188,7 @@ type KubernetesResourceManagerConfig struct { KubeconfigPath string `json:"kubeconfig_path"` DetMasterScheme string `json:"determined_master_scheme,omitempty"` - DetMasterIP string `json:"determined_master_ip,omitempty"` + DetMasterHost string `json:"determined_master_host,omitempty"` DetMasterPort int32 `json:"determined_master_port,omitempty"` DefaultAuxResourcePool string `json:"default_aux_resource_pool"` @@ -294,33 +294,27 @@ func (k *KubernetesResourceManagerConfig) UnmarshalJSON(data []byte) error { // Validate implements the check.Validatable 
interface. func (k KubernetesResourceManagerConfig) Validate() []error { - var checkSlotType error + var errs []error switch k.SlotType { case device.CPU, device.CUDA, device.ROCM: default: - checkSlotType = errors.Errorf("slot_type must be cuda, cpu, or rocm") + errs = append(errs, errors.New("slot_type must be cuda, cpu, or rocm")) } - var checkCPUResource error if k.SlotType == device.CPU { - checkCPUResource = check.GreaterThan( - k.SlotResourceRequests.CPU, float32(0), "slot_resource_requests.cpu must be > 0") + errs = append(errs, check.GreaterThan( + k.SlotResourceRequests.CPU, float32(0), "slot_resource_requests.cpu must be > 0")) } - var checkRMScheduler error if k.DefaultScheduler == PriorityScheduling { - checkRMScheduler = fmt.Errorf("the ``priority`` scheduler was deprecated, please " + - "use the default Kubernetes scheduler or coscheduler") + errs = append(errs, errors.New("the ``priority`` scheduler was deprecated, please "+ + "use the default Kubernetes scheduler or coscheduler")) } else if k.DefaultScheduler != "" && k.DefaultScheduler != "coscheduler" { - checkRMScheduler = fmt.Errorf("only blank or ``coscheduler`` values allowed for Kubernetes scheduler") + errs = append(errs, errors.New("only blank or ``coscheduler`` values allowed for Kubernetes scheduler")) } - return []error{ - checkSlotType, - checkCPUResource, - check.NotEmpty(k.ClusterName, "cluster_name is required"), - checkRMScheduler, - } + errs = append(errs, check.NotEmpty(k.ClusterName, "cluster_name is required")) + return errs } // PodSlotResourceRequests contains the per-slot container requests. diff --git a/master/internal/rm/kubernetesrm/job.go b/master/internal/rm/kubernetesrm/job.go index dadca6b8317..c796c75bf0c 100644 --- a/master/internal/rm/kubernetesrm/job.go +++ b/master/internal/rm/kubernetesrm/job.go @@ -80,7 +80,7 @@ type podNodeInfo struct { type job struct { // Configuration details. Set in initialization (the `newJob` constructor) and never modified after. 
clusterID string - masterIP string + masterHost string masterPort int32 masterScheme string masterTLSConfig model.TLSClientConfig @@ -133,7 +133,7 @@ func newJob( clusterID string, clientSet k8sClient.Interface, namespace string, - masterIP string, + masterHost string, masterPort int32, masterScheme string, masterTLSConfig model.TLSClientConfig, @@ -156,7 +156,7 @@ func newJob( allocationID: msg.allocationID, clientSet: clientSet, namespace: namespace, - masterIP: masterIP, + masterHost: masterHost, masterPort: masterPort, masterScheme: masterScheme, masterTLSConfig: masterTLSConfig, diff --git a/master/internal/rm/kubernetesrm/jobs.go b/master/internal/rm/kubernetesrm/jobs.go index 1f31ef289bc..790b90dd014 100644 --- a/master/internal/rm/kubernetesrm/jobs.go +++ b/master/internal/rm/kubernetesrm/jobs.go @@ -108,7 +108,7 @@ type jobsService struct { baseContainerDefaults *model.TaskContainerDefaultsConfig masterServiceName string masterTLSConfig model.TLSClientConfig - detMasterIP string + detMasterHost string detMasterPort int32 detMasterScheme string kubeconfigPath string @@ -175,7 +175,7 @@ func newJobsService( slotResourceRequests config.PodSlotResourceRequests, resourcePoolConfigs []config.ResourcePoolConfig, taskContainerDefaults *model.TaskContainerDefaultsConfig, - detMasterIP string, + detMasterHost string, detMasterPort int32, detMasterScheme string, kubeconfigPath string, @@ -200,7 +200,7 @@ func newJobsService( slotResourceRequests: slotResourceRequests, resourcePoolConfigs: resourcePoolConfigs, baseContainerDefaults: taskContainerDefaults, - detMasterIP: detMasterIP, + detMasterHost: detMasterHost, detMasterPort: detMasterPort, currentNodes: make(map[string]*k8sV1.Node), nodeToSystemResourceRequests: make(map[string]int64), @@ -225,7 +225,7 @@ func newJobsService( if err := p.startClientSet(ns); err != nil { return nil, err } - if err := p.getMasterIPAndPort(); err != nil { + if err := p.getMasterHostAndPort(); err != nil { return nil, err } if err := 
p.getSystemResourceRequests(); err != nil { @@ -419,22 +419,22 @@ func readClientConfig(kubeconfigPath string) (*rest.Config, error) { return cl, nil } -func (j *jobsService) getMasterIPAndPort() error { - if j.detMasterIP != "" && j.detMasterPort != 0 { +func (j *jobsService) getMasterHostAndPort() error { + if j.detMasterHost != "" && j.detMasterPort != 0 { // Master ip and port were manually configured. For special circumstances, e.g., the master is running // outside of this cluster (happens in development or when we spread across multiple k8s clusters). return nil } + masterService, err := j.clientSet.CoreV1(). Services(j.getInitialNamespace()). Get(context.TODO(), j.masterServiceName, metaV1.GetOptions{}) if err != nil { return fmt.Errorf("failed to get master service: %w", err) } - - j.detMasterIP = masterService.Spec.ClusterIP + j.detMasterHost = fmt.Sprintf("%s.%s.svc.cluster.local", masterService.Name, masterService.Namespace) j.detMasterPort = masterService.Spec.Ports[0].Port - j.syslog.Infof("master URL set to %s:%d", j.detMasterIP, j.detMasterPort) + j.syslog.Infof("master URL set to %s:%d", j.detMasterHost, j.detMasterPort) return nil } @@ -617,7 +617,7 @@ func (j *jobsService) startJob(msg startJob) error { msg.spec.ClusterID, j.clientSet, msg.namespace, - j.detMasterIP, + j.detMasterHost, j.detMasterPort, j.detMasterScheme, j.masterTLSConfig, @@ -908,7 +908,7 @@ func (j *jobsService) recreateJobHandler( startMsg.spec.ClusterID, j.clientSet, job.Namespace, - j.detMasterIP, + j.detMasterHost, j.detMasterPort, j.detMasterScheme, j.masterTLSConfig, diff --git a/master/internal/rm/kubernetesrm/kubernetes_resource_manager.go b/master/internal/rm/kubernetesrm/kubernetes_resource_manager.go index e558239304f..d22f62313a4 100644 --- a/master/internal/rm/kubernetesrm/kubernetes_resource_manager.go +++ b/master/internal/rm/kubernetesrm/kubernetes_resource_manager.go @@ -96,7 +96,7 @@ func New( config.PodSlotResourceRequests{CPU: 
k.config.SlotResourceRequests.CPU}, k.poolsConfig, k.taskContainerDefaults, - k.config.DetMasterIP, + k.config.DetMasterHost, k.config.DetMasterPort, k.config.DetMasterScheme, k.config.KubeconfigPath, diff --git a/master/internal/rm/kubernetesrm/spec.go b/master/internal/rm/kubernetesrm/spec.go index b2de65e4276..6788fe2346d 100644 --- a/master/internal/rm/kubernetesrm/spec.go +++ b/master/internal/rm/kubernetesrm/spec.go @@ -135,9 +135,9 @@ func (j *job) configureEnvVars( } envVarsMap["DET_CLUSTER_ID"] = j.clusterID - envVarsMap["DET_MASTER"] = fmt.Sprintf("%s://%s:%d", masterScheme, j.masterIP, j.masterPort) - envVarsMap["DET_MASTER_HOST"] = j.masterIP - envVarsMap["DET_MASTER_ADDR"] = j.masterIP + envVarsMap["DET_MASTER"] = fmt.Sprintf("%s://%s:%d", masterScheme, j.masterHost, j.masterPort) + envVarsMap["DET_MASTER_HOST"] = j.masterHost + envVarsMap["DET_MASTER_ADDR"] = j.masterHost envVarsMap["DET_MASTER_PORT"] = strconv.Itoa(int(j.masterPort)) envVarsMap["DET_SLOT_IDS"] = fmt.Sprintf("[%s]", strings.Join(slotIDs, ",")) if j.masterTLSConfig.CertificateName != "" { diff --git a/master/internal/rm/kubernetesrm/spec_test.go b/master/internal/rm/kubernetesrm/spec_test.go index af9454ecf33..415e280b01b 100644 --- a/master/internal/rm/kubernetesrm/spec_test.go +++ b/master/internal/rm/kubernetesrm/spec_test.go @@ -422,7 +422,7 @@ func TestDetMasterEnvVar(t *testing.T) { } j := &job{ - masterIP: "example.com", + masterHost: "example.com", masterPort: 1234, masterScheme: c.masterScheme, masterTLSConfig: c.masterTLSConfig, diff --git a/tools/k8s/devcluster.yaml b/tools/k8s/devcluster.yaml index 22c6692666c..a063495fcaa 100644 --- a/tools/k8s/devcluster.yaml +++ b/tools/k8s/devcluster.yaml @@ -51,7 +51,7 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: ~/.kube/config - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 # Example custom stage running the coscheduler in a docker container. 
In real diff --git a/tools/k8s/multicluster.yaml b/tools/k8s/multicluster.yaml index 38eaa5a4675..fa5c6207e3b 100644 --- a/tools/k8s/multicluster.yaml +++ b/tools/k8s/multicluster.yaml @@ -60,7 +60,7 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: ~/.kube/config - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 additional_resource_managers: @@ -72,7 +72,7 @@ stages: slot_resource_requests: cpu: 1 kubeconfig_path: ~/.kube/extraconfig - determined_master_ip: $DOCKER_LOCALHOST + determined_master_host: $DOCKER_LOCALHOST determined_master_port: 8080 resource_pools: - pool_name: extra diff --git a/tools/k8s/remote_connect.py b/tools/k8s/remote_connect.py index a0fd14d80e5..b9b6363a5fb 100755 --- a/tools/k8s/remote_connect.py +++ b/tools/k8s/remote_connect.py @@ -325,7 +325,7 @@ def update_devcluster(cfg: Config, gateway: Gateway, remote_port: int) -> pathli + "These will be ignored." ) assert resource_manager["type"] == "kubernetes" - resource_manager["determined_master_ip"] = cfg.reverse_proxy_host + resource_manager["determined_master_host"] = cfg.reverse_proxy_host resource_manager["determined_master_port"] = remote_port assert gateway.ip is not None, "Gateway IP is not set" resource_manager.update(gateway.to_config())