Skip to content
This repository has been archived by the owner on Jul 20, 2023. It is now read-only.

Commit

Permalink
Merge pull request #23 from katulu-io/GerardoGR/edge-task-restart-policy
Browse files Browse the repository at this point in the history
Keep successful FL-Operator pods (jobs) and only restart on-failure
  • Loading branch information
GerardoGR authored Jul 15, 2022
2 parents d9a8968 + 27789fc commit a147570
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 23 deletions.
23 changes: 0 additions & 23 deletions components/fl-operator/controllers/floperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,29 +151,6 @@ func (r *FlOperatorReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}, err
}

// Then cleanup the running clients that don't have
// a matching task in the response
for _, pod := range podsList.Items {
clientRunning := false

for _, task := range tasks {
if pod.Name == getPodName(task) {
clientRunning = true
break
}
}

if !clientRunning {
log.Info(fmt.Sprintf("Cleaning up pod: %s", pod.ObjectMeta.Name))

err = r.Delete(ctx, &pod)
if err != nil {
log.Error(err, "Couldn't delete pod resource")
continue
}
}
}

for _, task := range tasks {
log.Info(fmt.Sprintf("Found run ID: %s", task.ID))

Expand Down
12 changes: 12 additions & 0 deletions components/fl-operator/pkg/resources/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ func RenderEnvoyproxyConfig(context EnvoyConfigContext, envoyConfigFile string)

// Creates new pod for the flower-client
func NewPod(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedName, envoyConfigName string) *corev1.Pod {
shareProcessNamespace := true
labels := map[string]string{
FlClientDeploymentLabelKey: FlClientDeploymentLabelValue,
"run-id": string(task.ID),
Expand All @@ -66,11 +67,19 @@ func NewPod(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedName, en
Labels: labels,
},
Spec: corev1.PodSpec{
// For the flower-client to pkill the envoyproxy (see the container args below), the containers need to share the
// same process namespace.
ShareProcessNamespace: &shareProcessNamespace,
Containers: []corev1.Container{
{
Name: "flower-client",
Image: task.Executor.GetOciExecutor().Image,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"/bin/bash", "-c"},
// Assuming all flower-clients run with: python /app/main.py
// "pkill envoy" is needed to kill the envoyproxy once the flower-client finishes the fl-run and the
// pod is marked as completed.
Args: []string{"python /app/main.py && pkill envoy"},
},
{
Name: "envoyproxy",
Expand Down Expand Up @@ -101,6 +110,9 @@ func NewPod(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedName, en
},
},
},
// RestartPolicy: OnFailure is preferred because it allows the pod to fail on transient errors
// (e.g. flower-client's envoyproxy not ready, which causes it to crash)
RestartPolicy: "OnFailure",
Volumes: []corev1.Volume{
{
Name: envoyConfigVolumeKey,
Expand Down

0 comments on commit a147570

Please sign in to comment.