Skip to content

Commit

Permalink
Merge pull request #31 from NASA-IMPACT/gcorradini/feature
Browse files Browse the repository at this point in the history
Feature: Keep Flink Dashboards for Failed Runners and Dump Task Manager Logs
  • Loading branch information
ranchodeluxe authored Jan 30, 2024
2 parents 27892c5 + 8f49b28 commit 577009f
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/job-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,19 @@ jobs:
kubectl get pod | grep -v taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} > /tmp/jobmanager.log
cat /tmp/jobmanager.log
echo "##################### TASK MANAGER ######################"
# depending on the `inputs.parallelism` we can have more than one taskmanager so grab the first one
# TODO: loop through them all and dump the logs
kubectl get pod | grep taskmanager | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
cat /tmp/taskmanager.log
# depending on the `inputs.parallelism` we can have more than one taskmanager so only dump logs for the first three
iterations=$(expr "${{ github.event.inputs.parallelism }}" + 0) # cast to integer
for (( i = 1; i < iterations; i++ )); do
kubectl get pod | grep ${{ needs.run-job.outputs.job_name }} | grep task-manager-$i | cut -d' ' -f1 | head -n1 | xargs -I{} kubectl logs pod/{} -c flink-main-container > /tmp/taskmanager.log
cat /tmp/taskmanager.log
# break the loop if iteration count exceeds 3
if (( i >= 3 )); then
break
fi
done
# delete the flinkdeployment so we don't have old failures hanging around
kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
# NOTE: we actually want the failed flink deployments to stick around b/c we might want to inspect the flink dashboard
# kubectl get flinkdeployment --no-headers | grep ${{ needs.run-job.outputs.job_name }} | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
#################################################################
# provide feedback about OOM errors where we've seen them before
Expand Down

0 comments on commit 577009f

Please sign in to comment.