Skip to content

Commit

Permalink
wip! pathogen-repo-build: Wait for AWS Batch jobs to finish
Browse files Browse the repository at this point in the history
  • Loading branch information
tsibley committed Sep 13, 2023
1 parent 6e0c0d9 commit 0b0cf26
Show file tree
Hide file tree
Showing 2 changed files with 479 additions and 7 deletions.
345 changes: 341 additions & 4 deletions .github/workflows/pathogen-repo-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,10 @@ jobs:
uses: actions/checkout@v3
with:
repository: ${{ inputs.repo }}
# Need to run this after the build repo is cloned so that cloning the
# build repo does not overwrite the .git dir and remove the extra support files
# that we need from nextstrain/.github repo
- name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
- # Need to run this after the build repo is cloned so that cloning the
# build repo does not overwrite the .git dir and remove the extra support files
# that we need from nextstrain/.github repo
name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
uses: actions/checkout@v3
with:
repository: ${{ needs.workflow-context.outputs.repository }}
Expand Down Expand Up @@ -289,3 +289,340 @@ jobs:
logs/
.snakemake/log/
${{ inputs.artifact-paths }}
outputs:
AWS_BATCH_JOB_ID: ${{ env.AWS_BATCH_JOB_ID }}
# Wait for up to 6 hours (the maximum/default GitHub Actions job timeout¹)
# for the AWS Batch job to finish.
#
# ¹ <https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration#usage-limits>
wait-1:
  needs: [run-build, workflow-context]

  # Only wait when run-build actually reported an AWS Batch job ID.
  if: needs.run-build.outputs.AWS_BATCH_JOB_ID

  runs-on: ubuntu-latest

  # Stay attached for at most 6 hours (360 minutes, GitHub's default job
  # timeout).  When the timeout hits, the job is cancelled and the "attach"
  # step below concludes as "cancelled", which is what a following wait-N job
  # (if one exists) keys off of to re-attach.
  timeout-minutes: 360

  # Allow the workflow to be considered successful even if this job errors
  # due to cancellation (timing out).  Unfortunately, this doesn't
  # distinguish between error from cancellation and error from command
  # failure, so the "attach-step-conclusion" output below is used instead.
  continue-on-error: true

  # Emit a "conclusion" output for the job that's based on the built-in
  # conclusion (success, failure, cancelled) of the "attach" step below.
  # This is the conclusion we care about for the job since the job's own
  # "conclusion" is masked/transformed by "continue-on-error: true" above.
  outputs:
    attach-step-conclusion: ${{ steps.attach.conclusion }}

  steps:
    # Uses needs.workflow-context.outputs
    #
    # Checks out the workflow's own repository into NEXTSTRAIN_GITHUB_DIR so
    # that the local setup-nextstrain-cli action used below is available
    # (presumably NEXTSTRAIN_GITHUB_DIR is .git/nextstrain/.github; confirm
    # against the workflow-level env).  Unlike run-build, no build repo is
    # cloned in this job, so there is no checkout-ordering constraint here.
    - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
      uses: actions/checkout@v3
      with:
        repository: ${{ needs.workflow-context.outputs.repository }}
        ref: ${{ needs.workflow-context.outputs.sha }}
        path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}

    # Use static access keys when the secret is set; otherwise fall back to
    # assuming the IAM role.
    - if: inputs.runtime == 'aws-batch'
      uses: aws-actions/configure-aws-credentials@v2
      with:
        aws-region: us-east-1
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
        # XXX TODO: <https://github.com/aws-actions/configure-aws-credentials#credential-lifetime>

    - name: Setup runtime ${{ inputs.runtime }}
      uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
      with:
        cli-version: ">=7.1.0"
        runtime: ${{ inputs.runtime }}

    # Runs `nextstrain build --attach` in the background and waits on it, so
    # that the shell itself can trap the SIGINT sent on timeout/cancellation
    # and turn it into a detach (SIGTSTP) of the attached process.
    - id: attach
      name: Attach to AWS Batch job
      env:
        AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
      run: |
        # See <https://docs.github.com/en/actions/managing-workflow-runs/canceling-a-workflow#steps-github-takes-to-cancel-a-workflow-run>
        interrupt() {
          echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
          # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
          kill -SIGTSTP %
          wait %
          exit 0
        }
        trap interrupt SIGINT
        nextstrain build \
          --aws-batch \
          --attach "$AWS_BATCH_JOB_ID" \
          --no-download \
          . \
          &
        wait %
# Wait for up to another 6 hours (hours 6–12) if the preceding wait-N job
# timed out while attached to the AWS Batch job.
wait-2:
  needs: [wait-1, run-build, workflow-context]

  # Only take over when the previous wait job's "attach" step was cancelled
  # (i.e. it timed out while still attached).
  if: needs.wait-1.outputs.attach-step-conclusion == 'cancelled'

  runs-on: ubuntu-latest

  # Stay attached for at most 6 hours (360 minutes, GitHub's default job
  # timeout).  When the timeout hits, the job is cancelled and the "attach"
  # step below concludes as "cancelled", which is what a following wait-N job
  # (if one exists) keys off of to re-attach.
  timeout-minutes: 360

  # Allow the workflow to be considered successful even if this job errors
  # due to cancellation (timing out).  Unfortunately, this doesn't
  # distinguish between error from cancellation and error from command
  # failure, so the "attach-step-conclusion" output below is used instead.
  continue-on-error: true

  # Emit a "conclusion" output for the job that's based on the built-in
  # conclusion (success, failure, cancelled) of the "attach" step below.
  # This is the conclusion we care about for the job since the job's own
  # "conclusion" is masked/transformed by "continue-on-error: true" above.
  outputs:
    attach-step-conclusion: ${{ steps.attach.conclusion }}

  steps:
    # Uses needs.workflow-context.outputs
    #
    # Checks out the workflow's own repository into NEXTSTRAIN_GITHUB_DIR so
    # that the local setup-nextstrain-cli action used below is available
    # (presumably NEXTSTRAIN_GITHUB_DIR is .git/nextstrain/.github; confirm
    # against the workflow-level env).  Unlike run-build, no build repo is
    # cloned in this job, so there is no checkout-ordering constraint here.
    - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
      uses: actions/checkout@v3
      with:
        repository: ${{ needs.workflow-context.outputs.repository }}
        ref: ${{ needs.workflow-context.outputs.sha }}
        path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}

    # Use static access keys when the secret is set; otherwise fall back to
    # assuming the IAM role.
    - if: inputs.runtime == 'aws-batch'
      uses: aws-actions/configure-aws-credentials@v2
      with:
        aws-region: us-east-1
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
        # XXX TODO: <https://github.com/aws-actions/configure-aws-credentials#credential-lifetime>

    - name: Setup runtime ${{ inputs.runtime }}
      uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
      with:
        cli-version: ">=7.1.0"
        runtime: ${{ inputs.runtime }}

    # Runs `nextstrain build --attach` in the background and waits on it, so
    # that the shell itself can trap the SIGINT sent on timeout/cancellation
    # and turn it into a detach (SIGTSTP) of the attached process.
    - id: attach
      name: Attach to AWS Batch job
      env:
        AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
      run: |
        # See <https://docs.github.com/en/actions/managing-workflow-runs/canceling-a-workflow#steps-github-takes-to-cancel-a-workflow-run>
        interrupt() {
          echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
          # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
          kill -SIGTSTP %
          wait %
          exit 0
        }
        trap interrupt SIGINT
        nextstrain build \
          --aws-batch \
          --attach "$AWS_BATCH_JOB_ID" \
          --no-download \
          . \
          &
        wait %
# 12–18 hours
wait-3:
  needs: [wait-2, run-build, workflow-context]

  # Only take over when the previous wait job's "attach" step was cancelled
  # (i.e. it timed out while still attached).
  if: needs.wait-2.outputs.attach-step-conclusion == 'cancelled'

  runs-on: ubuntu-latest

  # Stay attached for at most 6 hours (360 minutes, GitHub's default job
  # timeout).  When the timeout hits, the job is cancelled and the "attach"
  # step below concludes as "cancelled", which is what a following wait-N job
  # (if one exists) keys off of to re-attach.
  timeout-minutes: 360

  # Allow the workflow to be considered successful even if this job errors
  # due to cancellation (timing out).  Unfortunately, this doesn't
  # distinguish between error from cancellation and error from command
  # failure, so the "attach-step-conclusion" output below is used instead.
  continue-on-error: true

  # Emit a "conclusion" output for the job that's based on the built-in
  # conclusion (success, failure, cancelled) of the "attach" step below.
  # This is the conclusion we care about for the job since the job's own
  # "conclusion" is masked/transformed by "continue-on-error: true" above.
  outputs:
    attach-step-conclusion: ${{ steps.attach.conclusion }}

  steps:
    # Uses needs.workflow-context.outputs
    #
    # Checks out the workflow's own repository into NEXTSTRAIN_GITHUB_DIR so
    # that the local setup-nextstrain-cli action used below is available
    # (presumably NEXTSTRAIN_GITHUB_DIR is .git/nextstrain/.github; confirm
    # against the workflow-level env).  Unlike run-build, no build repo is
    # cloned in this job, so there is no checkout-ordering constraint here.
    - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
      uses: actions/checkout@v3
      with:
        repository: ${{ needs.workflow-context.outputs.repository }}
        ref: ${{ needs.workflow-context.outputs.sha }}
        path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}

    # Use static access keys when the secret is set; otherwise fall back to
    # assuming the IAM role.
    - if: inputs.runtime == 'aws-batch'
      uses: aws-actions/configure-aws-credentials@v2
      with:
        aws-region: us-east-1
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
        # XXX TODO: <https://github.com/aws-actions/configure-aws-credentials#credential-lifetime>

    - name: Setup runtime ${{ inputs.runtime }}
      uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
      with:
        cli-version: ">=7.1.0"
        runtime: ${{ inputs.runtime }}

    # Runs `nextstrain build --attach` in the background and waits on it, so
    # that the shell itself can trap the SIGINT sent on timeout/cancellation
    # and turn it into a detach (SIGTSTP) of the attached process.
    - id: attach
      name: Attach to AWS Batch job
      env:
        AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
      run: |
        # See <https://docs.github.com/en/actions/managing-workflow-runs/canceling-a-workflow#steps-github-takes-to-cancel-a-workflow-run>
        interrupt() {
          echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
          # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
          kill -SIGTSTP %
          wait %
          exit 0
        }
        trap interrupt SIGINT
        nextstrain build \
          --aws-batch \
          --attach "$AWS_BATCH_JOB_ID" \
          --no-download \
          . \
          &
        wait %
# 18–24 hours
wait-4:
  needs: [wait-3, run-build, workflow-context]

  # Only take over when the previous wait job's "attach" step was cancelled
  # (i.e. it timed out while still attached).
  if: needs.wait-3.outputs.attach-step-conclusion == 'cancelled'

  runs-on: ubuntu-latest

  # Stay attached for at most 6 hours (360 minutes, GitHub's default job
  # timeout).  When the timeout hits, the job is cancelled and the "attach"
  # step below concludes as "cancelled", which is what a following wait-N job
  # (if one exists) keys off of to re-attach.
  timeout-minutes: 360

  # Allow the workflow to be considered successful even if this job errors
  # due to cancellation (timing out).  Unfortunately, this doesn't
  # distinguish between error from cancellation and error from command
  # failure, so the "attach-step-conclusion" output below is used instead.
  continue-on-error: true

  # Emit a "conclusion" output for the job that's based on the built-in
  # conclusion (success, failure, cancelled) of the "attach" step below.
  # This is the conclusion we care about for the job since the job's own
  # "conclusion" is masked/transformed by "continue-on-error: true" above.
  outputs:
    attach-step-conclusion: ${{ steps.attach.conclusion }}

  steps:
    # Uses needs.workflow-context.outputs
    #
    # Checks out the workflow's own repository into NEXTSTRAIN_GITHUB_DIR so
    # that the local setup-nextstrain-cli action used below is available
    # (presumably NEXTSTRAIN_GITHUB_DIR is .git/nextstrain/.github; confirm
    # against the workflow-level env).  Unlike run-build, no build repo is
    # cloned in this job, so there is no checkout-ordering constraint here.
    - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
      uses: actions/checkout@v3
      with:
        repository: ${{ needs.workflow-context.outputs.repository }}
        ref: ${{ needs.workflow-context.outputs.sha }}
        path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}

    # Use static access keys when the secret is set; otherwise fall back to
    # assuming the IAM role.
    - if: inputs.runtime == 'aws-batch'
      uses: aws-actions/configure-aws-credentials@v2
      with:
        aws-region: us-east-1
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
        # XXX TODO: <https://github.com/aws-actions/configure-aws-credentials#credential-lifetime>

    - name: Setup runtime ${{ inputs.runtime }}
      uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
      with:
        cli-version: ">=7.1.0"
        runtime: ${{ inputs.runtime }}

    # Runs `nextstrain build --attach` in the background and waits on it, so
    # that the shell itself can trap the SIGINT sent on timeout/cancellation
    # and turn it into a detach (SIGTSTP) of the attached process.
    - id: attach
      name: Attach to AWS Batch job
      env:
        AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
      run: |
        # See <https://docs.github.com/en/actions/managing-workflow-runs/canceling-a-workflow#steps-github-takes-to-cancel-a-workflow-run>
        interrupt() {
          echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
          # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
          kill -SIGTSTP %
          wait %
          exit 0
        }
        trap interrupt SIGINT
        nextstrain build \
          --aws-batch \
          --attach "$AWS_BATCH_JOB_ID" \
          --no-download \
          . \
          &
        wait %
# Since the wait-N jobs use "continue-on-error: true" out of necessity (to
# avoid failing the whole workflow when they time out and get cancelled), we
# use a final job here to succeed or fail the whole workflow based on the
# aggregate of their "attach" step conclusions.
conclusion:
  runs-on: ubuntu-latest

  # Evaluate this job no matter how the wait-N jobs ended (including when
  # some of them were skipped), so the aggregate check below always happens.
  if: ${{ always() }}

  needs:
    - wait-1
    - wait-2
    - wait-3
    - wait-4

  steps:
    # Exits 1 when any needed job reported an "attach-step-conclusion" output
    # of "failure"; exits 0 otherwise.
    - run: exit ${{ contains(needs.*.outputs.attach-step-conclusion, 'failure') && '1' || '0' }}
      name: All attach steps in wait-N jobs were successful (or skipped)
# XXX TODO: Jobs can fall off the end of our wait-N chain and appear to be
# successful/complete in GitHub but still running on AWS. Probably very
# rare in reality, though, for an AWS job to take longer than 24h?
# -trs, 12 Sept 2023
# Cancel the AWS Batch job if the GitHub workflow run is cancelled.
cancellation:
  needs: [wait-4, run-build, workflow-context]

  # Run only when the workflow run was cancelled *and* run-build reported an
  # AWS Batch job ID.  Without an ID there is nothing to cancel, and attaching
  # with an empty ID would only produce a spurious failure.
  if: cancelled() && needs.run-build.outputs.AWS_BATCH_JOB_ID

  runs-on: ubuntu-latest

  # The cancellation job may fail, but we don't want that to impact the
  # overall workflow run status.
  continue-on-error: true

  steps:
    # Uses needs.workflow-context.outputs
    #
    # Checks out the workflow's own repository into NEXTSTRAIN_GITHUB_DIR so
    # that the local setup-nextstrain-cli action used below is available
    # (presumably NEXTSTRAIN_GITHUB_DIR is .git/nextstrain/.github; confirm
    # against the workflow-level env).  Unlike run-build, no build repo is
    # cloned in this job, so there is no checkout-ordering constraint here.
    - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
      uses: actions/checkout@v3
      with:
        repository: ${{ needs.workflow-context.outputs.repository }}
        ref: ${{ needs.workflow-context.outputs.sha }}
        path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}

    # Use static access keys when the secret is set; otherwise fall back to
    # assuming the IAM role.
    - if: inputs.runtime == 'aws-batch'
      uses: aws-actions/configure-aws-credentials@v2
      with:
        aws-region: us-east-1
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
        # XXX TODO: <https://github.com/aws-actions/configure-aws-credentials#credential-lifetime>

    - name: Setup runtime ${{ inputs.runtime }}
      uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
      with:
        cli-version: ">=7.1.0"
        runtime: ${{ inputs.runtime }}

    # Attaches to the AWS Batch job in the background, then interrupts the
    # attached process so that it cancels the job (see the comments in the
    # script for the expected exit statuses).
    - id: cancel
      name: Cancel AWS Batch job
      env:
        AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
      run: |
        # XXX FIXME: instead of signaling here, use --cancel in new release
        set -x
        nextstrain build \
          --aws-batch \
          --attach "$AWS_BATCH_JOB_ID" \
          --no-download \
          . \
          &
        # `nextstrain` will cancel the AWS Batch job upon receiving SIGINT
        # and stay attached while it waits for cancellation to occur, before
        # finally exiting non-zero.  In the unlikely event that the job
        # completes before cancellation can occur, it'll exit 0, and we want
        # to treat that as an error.
        sleep 5
        kill -SIGINT %
        sleep 1
        kill -SIGINT %
        wait % && exit 1 || exit 0
Loading

0 comments on commit 0b0cf26

Please sign in to comment.