From c7b03e92a5454e63ee1b563b541f0268c64f3d01 Mon Sep 17 00:00:00 2001 From: ranchodeluxe Date: Tue, 30 Jan 2024 06:43:27 -0800 Subject: [PATCH 1/2] leave failed runners around --- .github/workflows/job-runner.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index 2877a83..a0abb77 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -252,13 +252,19 @@ jobs: kubectl get pod | grep -v taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} > /tmp/jobmanager.log cat /tmp/jobmanager.log echo "##################### TASK MANAGER ######################" - # depending on the `inputs.parallism` we can more than one taskmanager so grab the first one - # TODO: loop through them all and dump the logs - kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log - cat /tmp/taskmanager.log + # depending on the `inputs.parallism` we can more than one taskmanager so only dump logs for the first three + iterations=$(expr "${{ github.event.inputs.parallelism }}" + 0) # cast to integer + for (( i = 0; i < iterations; i++ )); do + kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log + cat /tmp/taskmanager.log + # break the loop if iteration count exceeds 3 + if (( i >= 3 )); then + break + fi + done - # delete the flinkdeployment so we don't have old failures hanging around - kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{} + # NOTE: we actually want the failed flink deployments to stick around b/c we might want to inspect the flink 
dashboard + # kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{} ################################################################# # provide feedback about OOM errors where we've seen them before From 8f49b281e428424128ac913c7a875e6e6bcc7437 Mon Sep 17 00:00:00 2001 From: ranchodeluxe Date: Tue, 30 Jan 2024 06:54:07 -0800 Subject: [PATCH 2/2] better --- .github/workflows/job-runner.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index a0abb77..20f8180 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -252,10 +252,10 @@ jobs: kubectl get pod | grep -v taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} > /tmp/jobmanager.log cat /tmp/jobmanager.log echo "##################### TASK MANAGER ######################" - # depending on the `inputs.parallism` we can more than one taskmanager so only dump logs for the first three + # depending on the `inputs.parallelism` we can have more than one taskmanager, so dump logs for at most the first three (iteration i picks the i-th taskmanager pod in the listing) iterations=$(expr "${{ github.event.inputs.parallelism }}" + 0) # cast to integer - for (( i = 0; i < iterations; i++ )); do - kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log + for (( i = 1; i <= iterations; i++ )); do + kubectl get pod | grep ${{ needs.run-job.outputs.job_name }} | grep taskmanager | cut -d' ' -f1 | sed -n "${i}p" | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log cat /tmp/taskmanager.log # break the loop if iteration count exceeds 3 if (( i >= 3 )); then