Skip to content

Commit

Permalink
chore: expose determined_master_host and friends in helm and default …
Browse files Browse the repository at this point in the history
…it more reasonably
  • Loading branch information
stoksc committed Oct 21, 2024
1 parent b70a622 commit 4d68c85
Show file tree
Hide file tree
Showing 15 changed files with 87 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .circleci/devcluster/multi-k8s.devcluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: /tmp/defaultrm-kubeconf
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080
internal_task_gateway:
gateway_name: contour
Expand All @@ -60,7 +60,7 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: /tmp/additionalrm-kubeconf
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080
resource_pools:
- pool_name: additional_pool
2 changes: 1 addition & 1 deletion .circleci/devcluster/single-k8s.devcluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: ~/.kube/config
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080
8 changes: 8 additions & 0 deletions docs/release-notes/add-host-port-scheme-to-helm.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
:orphan:

**New Features**

- Helm: Support configuring `determined_master_host`, `determined_master_port`, and `determined_master_scheme`. These
control how tasks address the Determined API server and are useful when installations span multiple Kubernetes
clusters or there are proxies in between tasks and the master. Also, `determined_master_host` now defaults to
the service host, `<det_namespace>.<det_service_name>.svc.cluster.local`, instead of the service IP.
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ the same as the “cluster name” for a given cluster.

If an additional resource manager needs to connect to the Determined master through a gateway
requiring TLS, ``resource_manager.determined_master_scheme`` should be set to ``https``. If
``resource_manager.determined_master_scheme`` is not set ``determined_master_ip`` will assume
``resource_manager.determined_master_scheme`` is not set ``determined_master_host`` will assume
``https`` if the master is terminating TLS and ``http`` otherwise.

*******
Expand Down
15 changes: 15 additions & 0 deletions helm/charts/determined/templates/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,21 @@ stringData:
fluent:
{{- toYaml .Values.fluent | nindent 8}}
{{- end }}
{{- if .Values.determinedMasterHost }}
{{- if .Values.determinedMasterScheme }}
determined_master_scheme: {{ .Values.determinedMasterScheme | quote }}
{{- else if .Values.tlsSecret }}
determined_master_scheme: "https"
{{- else }}
determined_master_scheme: "http"
{{- end }}
determined_master_host: {{ .Values.determinedMasterHost | quote }}
{{- if .Values.determinedMasterPort }}
determined_master_port: {{ .Values.determinedMasterPort }}
{{- else }}
determined_master_port: {{ .Values.masterPort }}
{{- end }}
{{- end }}
default_aux_resource_pool: {{.Values.defaultAuxResourcePool}}
default_compute_resource_pool: {{.Values.defaultComputeResourcePool}}
Expand Down
12 changes: 11 additions & 1 deletion helm/charts/determined/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,16 @@ resourcePools:
## Configure the initial user password for the cluster
# initialUserPassword:

# determinedMasterHost configures the hostname that tasks launched by the primary resource manager use when
# communicating with our API server. This is useful when installations span multiple Kubernetes clusters and when there
# are proxies in between tasks and the master. It defaults to `<det_namespace>.<det_service_name>.svc.cluster.local`.
# determinedMasterHost:
# determinedMasterPort configures the port for the host above. It defaults to `masterPort` specified elsewhere. Must
# be used with determinedMasterHost or it is ineffective.
# determinedMasterPort:
# determinedMasterScheme configures the scheme for the host and port above. It defaults to `https` if our API server
# is deployed with TLS, else `http`. Must be used with determinedMasterHost or it is ineffective.
# determinedMasterScheme:
# additional_resource_managers:
# - resource_manager:
# type: kubernetes
Expand All @@ -417,7 +427,7 @@ resourcePools:
# default_namespace: default
# kubeconfig_secret_name: additionalrm
# kubeconfig_secret_value: config
# determined_master_ip: 10.11.12.13
# determined_master_host: 10.11.12.13
# determined_master_port: 8080
# resource_pools:
# - pool_name: additional_pool
Expand Down
42 changes: 26 additions & 16 deletions master/internal/config/resource_manager_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,11 @@ type KubernetesResourceManagerConfig struct {
KubeconfigPath string `json:"kubeconfig_path"`

DetMasterScheme string `json:"determined_master_scheme,omitempty"`
DetMasterIP string `json:"determined_master_ip,omitempty"`
DetMasterPort int32 `json:"determined_master_port,omitempty"`
// DeprecatedDetMasterHost shouldn't be used. Use the method DetMasterHost instead.
DeprecatedDetMasterHost string `json:"determined_master_host,omitempty"`
// DeprecatedDetMasterIP shouldn't be used. Use the method DetMasterHost instead.
DeprecatedDetMasterIP string `json:"determined_master_ip,omitempty"`
DetMasterPort int32 `json:"determined_master_port,omitempty"`

DefaultAuxResourcePool string `json:"default_aux_resource_pool"`
DefaultComputeResourcePool string `json:"default_compute_resource_pool"`
Expand All @@ -203,6 +206,15 @@ type KubernetesResourceManagerConfig struct {
Metadata map[string]string `json:"metadata"`
}

// DetMasterHost returns `det_master_host` from the config, falling back to the older `det_master_ip`. Callers
// should use this method instead the fields directly but the fields must be public for YAML deserialization to work.
func (k KubernetesResourceManagerConfig) DetMasterHost() string {
if k.DeprecatedDetMasterHost != "" {
return k.DeprecatedDetMasterHost
}
return k.DeprecatedDetMasterIP
}

// InternalTaskGatewayConfig is config for exposing Determined tasks to outside of the cluster.
// Useful for multirm when we can only be running in a single cluster.
type InternalTaskGatewayConfig struct {
Expand Down Expand Up @@ -294,33 +306,31 @@ func (k *KubernetesResourceManagerConfig) UnmarshalJSON(data []byte) error {

// Validate implements the check.Validatable interface.
func (k KubernetesResourceManagerConfig) Validate() []error {
var checkSlotType error
var errs []error
switch k.SlotType {
case device.CPU, device.CUDA, device.ROCM:
default:
checkSlotType = errors.Errorf("slot_type must be cuda, cpu, or rocm")
errs = append(errs, errors.New("slot_type must be cuda, cpu, or rocm"))
}

var checkCPUResource error
if k.SlotType == device.CPU {
checkCPUResource = check.GreaterThan(
k.SlotResourceRequests.CPU, float32(0), "slot_resource_requests.cpu must be > 0")
errs = append(errs, check.GreaterThan(
k.SlotResourceRequests.CPU, float32(0), "slot_resource_requests.cpu must be > 0"))
}

var checkRMScheduler error
if k.DefaultScheduler == PriorityScheduling {
checkRMScheduler = fmt.Errorf("the ``priority`` scheduler was deprecated, please " +
"use the default Kubernetes scheduler or coscheduler")
errs = append(errs, errors.New("the ``priority`` scheduler was deprecated, please "+
"use the default Kubernetes scheduler or coscheduler"))
} else if k.DefaultScheduler != "" && k.DefaultScheduler != "coscheduler" {
checkRMScheduler = fmt.Errorf("only blank or ``coscheduler`` values allowed for Kubernetes scheduler")
errs = append(errs, errors.New("only blank or ``coscheduler`` values allowed for Kubernetes scheduler"))
}

return []error{
checkSlotType,
checkCPUResource,
check.NotEmpty(k.ClusterName, "cluster_name is required"),
checkRMScheduler,
if k.DeprecatedDetMasterHost != "" && k.DeprecatedDetMasterIP != "" {
errs = append(errs, errors.New("use the new determined_master_host instead of determined_master_ip, not both"))
}

errs = append(errs, check.NotEmpty(k.ClusterName, "cluster_name is required"))
return errs
}

// PodSlotResourceRequests contains the per-slot container requests.
Expand Down
6 changes: 3 additions & 3 deletions master/internal/rm/kubernetesrm/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ type podNodeInfo struct {
type job struct {
// Configuration details. Set in initialization (the `newJob` constructor) and never modified after.
clusterID string
masterIP string
masterHost string
masterPort int32
masterScheme string
masterTLSConfig model.TLSClientConfig
Expand Down Expand Up @@ -133,7 +133,7 @@ func newJob(
clusterID string,
clientSet k8sClient.Interface,
namespace string,
masterIP string,
masterHost string,
masterPort int32,
masterScheme string,
masterTLSConfig model.TLSClientConfig,
Expand All @@ -156,7 +156,7 @@ func newJob(
allocationID: msg.allocationID,
clientSet: clientSet,
namespace: namespace,
masterIP: masterIP,
masterHost: masterHost,
masterPort: masterPort,
masterScheme: masterScheme,
masterTLSConfig: masterTLSConfig,
Expand Down
22 changes: 11 additions & 11 deletions master/internal/rm/kubernetesrm/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ type jobsService struct {
baseContainerDefaults *model.TaskContainerDefaultsConfig
masterServiceName string
masterTLSConfig model.TLSClientConfig
detMasterIP string
detMasterHost string
detMasterPort int32
detMasterScheme string
kubeconfigPath string
Expand Down Expand Up @@ -175,7 +175,7 @@ func newJobsService(
slotResourceRequests config.PodSlotResourceRequests,
resourcePoolConfigs []config.ResourcePoolConfig,
taskContainerDefaults *model.TaskContainerDefaultsConfig,
detMasterIP string,
detMasterHost string,
detMasterPort int32,
detMasterScheme string,
kubeconfigPath string,
Expand All @@ -200,7 +200,7 @@ func newJobsService(
slotResourceRequests: slotResourceRequests,
resourcePoolConfigs: resourcePoolConfigs,
baseContainerDefaults: taskContainerDefaults,
detMasterIP: detMasterIP,
detMasterHost: detMasterHost,
detMasterPort: detMasterPort,
currentNodes: make(map[string]*k8sV1.Node),
nodeToSystemResourceRequests: make(map[string]int64),
Expand All @@ -225,7 +225,7 @@ func newJobsService(
if err := p.startClientSet(ns); err != nil {
return nil, err
}
if err := p.getMasterIPAndPort(); err != nil {
if err := p.getMasterHostAndPort(); err != nil {
return nil, err
}
if err := p.getSystemResourceRequests(); err != nil {
Expand Down Expand Up @@ -419,22 +419,22 @@ func readClientConfig(kubeconfigPath string) (*rest.Config, error) {
return cl, nil
}

func (j *jobsService) getMasterIPAndPort() error {
if j.detMasterIP != "" && j.detMasterPort != 0 {
func (j *jobsService) getMasterHostAndPort() error {
if j.detMasterHost != "" && j.detMasterPort != 0 {
// Master ip and port were manually configured. For special circumstances, e.g., the master is running
// outside of this cluster (happens in development or when we spread across multiple k8s clusters).
return nil
}

masterService, err := j.clientSet.CoreV1().
Services(j.getInitialNamespace()).
Get(context.TODO(), j.masterServiceName, metaV1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get master service: %w", err)
}

j.detMasterIP = masterService.Spec.ClusterIP
j.detMasterHost = fmt.Sprintf("%s.%s.svc.cluster.local", masterService.Namespace, masterService.Name)
j.detMasterPort = masterService.Spec.Ports[0].Port
j.syslog.Infof("master URL set to %s:%d", j.detMasterIP, j.detMasterPort)
j.syslog.Infof("master URL set to %s:%d", j.detMasterHost, j.detMasterPort)
return nil
}

Expand Down Expand Up @@ -617,7 +617,7 @@ func (j *jobsService) startJob(msg startJob) error {
msg.spec.ClusterID,
j.clientSet,
msg.namespace,
j.detMasterIP,
j.detMasterHost,
j.detMasterPort,
j.detMasterScheme,
j.masterTLSConfig,
Expand Down Expand Up @@ -908,7 +908,7 @@ func (j *jobsService) recreateJobHandler(
startMsg.spec.ClusterID,
j.clientSet,
job.Namespace,
j.detMasterIP,
j.detMasterHost,
j.detMasterPort,
j.detMasterScheme,
j.masterTLSConfig,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func New(
config.PodSlotResourceRequests{CPU: k.config.SlotResourceRequests.CPU},
k.poolsConfig,
k.taskContainerDefaults,
k.config.DetMasterIP,
k.config.DetMasterHost(),
k.config.DetMasterPort,
k.config.DetMasterScheme,
k.config.KubeconfigPath,
Expand Down
6 changes: 3 additions & 3 deletions master/internal/rm/kubernetesrm/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,9 @@ func (j *job) configureEnvVars(
}

envVarsMap["DET_CLUSTER_ID"] = j.clusterID
envVarsMap["DET_MASTER"] = fmt.Sprintf("%s://%s:%d", masterScheme, j.masterIP, j.masterPort)
envVarsMap["DET_MASTER_HOST"] = j.masterIP
envVarsMap["DET_MASTER_ADDR"] = j.masterIP
envVarsMap["DET_MASTER"] = fmt.Sprintf("%s://%s:%d", masterScheme, j.masterHost, j.masterPort)
envVarsMap["DET_MASTER_HOST"] = j.masterHost
envVarsMap["DET_MASTER_ADDR"] = j.masterHost
envVarsMap["DET_MASTER_PORT"] = strconv.Itoa(int(j.masterPort))
envVarsMap["DET_SLOT_IDS"] = fmt.Sprintf("[%s]", strings.Join(slotIDs, ","))
if j.masterTLSConfig.CertificateName != "" {
Expand Down
2 changes: 1 addition & 1 deletion master/internal/rm/kubernetesrm/spec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ func TestDetMasterEnvVar(t *testing.T) {
}

j := &job{
masterIP: "example.com",
masterHost: "example.com",
masterPort: 1234,
masterScheme: c.masterScheme,
masterTLSConfig: c.masterTLSConfig,
Expand Down
2 changes: 1 addition & 1 deletion tools/k8s/devcluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: ~/.kube/config
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080

# Example custom stage running the coscheduler in a docker container. In real
Expand Down
4 changes: 2 additions & 2 deletions tools/k8s/multicluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: ~/.kube/config
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080

additional_resource_managers:
Expand All @@ -72,7 +72,7 @@ stages:
slot_resource_requests:
cpu: 1
kubeconfig_path: ~/.kube/extraconfig
determined_master_ip: $DOCKER_LOCALHOST
determined_master_host: $DOCKER_LOCALHOST
determined_master_port: 8080
resource_pools:
- pool_name: extra
2 changes: 1 addition & 1 deletion tools/k8s/remote_connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def update_devcluster(cfg: Config, gateway: Gateway, remote_port: int) -> pathli
+ "These will be ignored."
)
assert resource_manager["type"] == "kubernetes"
resource_manager["determined_master_ip"] = cfg.reverse_proxy_host
resource_manager["determined_master_host"] = cfg.reverse_proxy_host
resource_manager["determined_master_port"] = remote_port
assert gateway.ip is not None, "Gateway IP is not set"
resource_manager.update(gateway.to_config())
Expand Down

0 comments on commit 4d68c85

Please sign in to comment.