From c7b03e92a5454e63ee1b563b541f0268c64f3d01 Mon Sep 17 00:00:00 2001
From: ranchodeluxe <greg.corradini@gmail.com>
Date: Tue, 30 Jan 2024 06:43:27 -0800
Subject: [PATCH 1/2] leave failed runners around

---
 .github/workflows/job-runner.yaml | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 2877a83..a0abb77 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -252,13 +252,19 @@ jobs:
           kubectl get pod | grep -v taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} > /tmp/jobmanager.log
           cat /tmp/jobmanager.log
           echo "##################### TASK MANAGER ######################"
-          # depending on the `inputs.parallism` we can more than one taskmanager so grab the first one
-          # TODO: loop through them all and dump the logs
-          kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
-          cat /tmp/taskmanager.log
+          # depending on the `inputs.parallism` we can more than one taskmanager so only dump logs for the first three
+          iterations=$(expr "${{ github.event.inputs.parallelism }}" + 0)  # cast to integer
+          for (( i = 0; i < iterations; i++ )); do
+              kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
+              cat /tmp/taskmanager.log
+              # break the loop if iteration count exceeds 3
+              if (( i >= 3 )); then
+                  break
+              fi
+          done
 
-          # delete the flinkdeployment so we don't have old failures hanging around
-          kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
+          # NOTE: we actually want the failed flink deployments to stick around b/c we might want to inspect the flink dashboard
+          # kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
 
           #################################################################
           # provide feedback about OOM errors where we've seen them before

From 8f49b281e428424128ac913c7a875e6e6bcc7437 Mon Sep 17 00:00:00 2001
From: ranchodeluxe <greg.corradini@gmail.com>
Date: Tue, 30 Jan 2024 06:54:07 -0800
Subject: [PATCH 2/2] select taskmanager pods by index when dumping logs

---
 .github/workflows/job-runner.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index a0abb77..20f8180 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -252,10 +252,10 @@ jobs:
           kubectl get pod | grep -v taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} > /tmp/jobmanager.log
           cat /tmp/jobmanager.log
           echo "##################### TASK MANAGER ######################"
-          # depending on the `inputs.parallism` we can more than one taskmanager so only dump logs for the first three
+          # depending on the `inputs.parallelism` we can have more than one taskmanager so only dump logs for the first three
           iterations=$(expr "${{ github.event.inputs.parallelism }}" + 0)  # cast to integer
-          for (( i = 0; i < iterations; i++ )); do
-              kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
+          for (( i = 1; i < iterations; i++ )); do
+              kubectl get pod | grep ${{ needs.run-job.outputs.job_name }} | grep task-manager-$i | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
               cat /tmp/taskmanager.log
               # break the loop if iteration count exceeds 3
               if (( i >= 3 )); then