From 0b0cf269797a03c963037af9670a117bd570f959 Mon Sep 17 00:00:00 2001
From: Thomas Sibley
Date: Tue, 12 Sep 2023 15:42:18 -0700
Subject: [PATCH] wip! pathogen-repo-build: Wait for AWS Batch jobs to finish

---
Reviewer notes (ignored by `git am`): this patch adds wait-1 … wait-4 jobs that
re-attach to the AWS Batch job started by run-build, a `conclusion` job that
fails the run if any attach step failed, and a `cancellation` job that cancels
the AWS Batch job when the workflow run is cancelled.  The .yaml file is the
anchor-expanded counterpart of the .yaml.in template; keep the two in sync.

 .github/workflows/pathogen-repo-build.yaml    | 345 +++++++++++++++++-
 .github/workflows/pathogen-repo-build.yaml.in | 141 ++++++-
 2 files changed, 479 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/pathogen-repo-build.yaml b/.github/workflows/pathogen-repo-build.yaml
index cd9ada1..b72708b 100644
--- a/.github/workflows/pathogen-repo-build.yaml
+++ b/.github/workflows/pathogen-repo-build.yaml
@@ -219,10 +219,10 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: ${{ inputs.repo }}
-      # Need to run this after the build repo is cloned so that cloning the
-      # build repo does not overwrite the .git dir and remove the extra support files
-      # that we need from nextstrain/.github repo
-      - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
         uses: actions/checkout@v3
         with:
           repository: ${{ needs.workflow-context.outputs.repository }}
@@ -289,3 +289,340 @@ jobs:
             logs/
             .snakemake/log/
             ${{ inputs.artifact-paths }}
+    outputs:
+      AWS_BATCH_JOB_ID: ${{ env.AWS_BATCH_JOB_ID }}
+  # Wait for up to 6 hours (the maximum/default GitHub Actions job timeout¹)
+  # for the AWS Batch job to finish.
+  #
+  # ¹ See jobs.<job_id>.timeout-minutes in the GitHub Actions workflow syntax reference.
+  wait-1:
+    # XXX FIXME: drop this override (presumably for testing) to restore the 6-hour default
+    timeout-minutes: 1
+    needs: [run-build, workflow-context]
+    if: needs.run-build.outputs.AWS_BATCH_JOB_ID
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ needs.workflow-context.outputs.repository }}
+          ref: ${{ needs.workflow-context.outputs.sha }}
+          path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}
+      - if: inputs.runtime == 'aws-batch'
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
+      # XXX TODO:
+      - name: Setup runtime ${{ inputs.runtime }}
+        uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
+        with:
+          cli-version: ">=7.1.0"
+          runtime: ${{ inputs.runtime }}
+      - id: attach
+        name: Attach to AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # Background the CLI and `wait` on it so bash can run the SIGINT trap while still attached.
+          interrupt() {
+            echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
+            # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
+            kill -SIGTSTP %
+            wait %
+            exit 0
+          }
+          trap interrupt SIGINT
+
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          wait %
+    # Allow the workflow to be considered successful even if this job errors
+    # due to cancellation (timing out).  Unfortunately, this doesn't
+    # distinguish between error from cancellation and error from command
+    # failure, so we work around that below.
+    continue-on-error: true
+    # Emit a "conclusion" output for the job that's based on the built-in
+    # conclusion (success, failure, cancelled) of the "attach" step above.
+    # This is the conclusion we care about for the job since the job's own
+    # "conclusion" is masked/transformed by "continue-on-error: true" above.
+    outputs:
+      attach-step-conclusion: ${{ steps.attach.conclusion }}
+  # Wait for up to another 6 hours (hours 6–12) if the preceding wait-N job
+  # timed out while attached to the AWS Batch job.
+  wait-2:
+    # XXX FIXME: drop this override (presumably for testing) to restore the 6-hour default
+    timeout-minutes: 1
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ needs.workflow-context.outputs.repository }}
+          ref: ${{ needs.workflow-context.outputs.sha }}
+          path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}
+      - if: inputs.runtime == 'aws-batch'
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
+      # XXX TODO:
+      - name: Setup runtime ${{ inputs.runtime }}
+        uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
+        with:
+          cli-version: ">=7.1.0"
+          runtime: ${{ inputs.runtime }}
+      - id: attach
+        name: Attach to AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # Background the CLI and `wait` on it so bash can run the SIGINT trap while still attached.
+          interrupt() {
+            echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
+            # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
+            kill -SIGTSTP %
+            wait %
+            exit 0
+          }
+          trap interrupt SIGINT
+
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          wait %
+    # Allow the workflow to be considered successful even if this job errors
+    # due to cancellation (timing out).  Unfortunately, this doesn't
+    # distinguish between error from cancellation and error from command
+    # failure, so we work around that below.
+    continue-on-error: true
+    # Emit a "conclusion" output for the job that's based on the built-in
+    # conclusion (success, failure, cancelled) of the "attach" step above.
+    # This is the conclusion we care about for the job since the job's own
+    # "conclusion" is masked/transformed by "continue-on-error: true" above.
+    outputs:
+      attach-step-conclusion: ${{ steps.attach.conclusion }}
+    needs: [wait-1, run-build, workflow-context]
+    if: needs.wait-1.outputs.attach-step-conclusion == 'cancelled'
+  # Likewise for hours 12–18, if wait-2 timed out while attached.
+  wait-3:
+    # XXX FIXME: drop this override (presumably for testing) to restore the 6-hour default
+    timeout-minutes: 1
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ needs.workflow-context.outputs.repository }}
+          ref: ${{ needs.workflow-context.outputs.sha }}
+          path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}
+      - if: inputs.runtime == 'aws-batch'
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
+      # XXX TODO:
+      - name: Setup runtime ${{ inputs.runtime }}
+        uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
+        with:
+          cli-version: ">=7.1.0"
+          runtime: ${{ inputs.runtime }}
+      - id: attach
+        name: Attach to AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # Background the CLI and `wait` on it so bash can run the SIGINT trap while still attached.
+          interrupt() {
+            echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
+            # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
+            kill -SIGTSTP %
+            wait %
+            exit 0
+          }
+          trap interrupt SIGINT
+
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          wait %
+    # Allow the workflow to be considered successful even if this job errors
+    # due to cancellation (timing out).  Unfortunately, this doesn't
+    # distinguish between error from cancellation and error from command
+    # failure, so we work around that below.
+    continue-on-error: true
+    # Emit a "conclusion" output for the job that's based on the built-in
+    # conclusion (success, failure, cancelled) of the "attach" step above.
+    # This is the conclusion we care about for the job since the job's own
+    # "conclusion" is masked/transformed by "continue-on-error: true" above.
+    outputs:
+      attach-step-conclusion: ${{ steps.attach.conclusion }}
+    needs: [wait-2, run-build, workflow-context]
+    if: needs.wait-2.outputs.attach-step-conclusion == 'cancelled'
+  # Likewise for hours 18–24, if wait-3 timed out while attached.
+  wait-4:
+    # XXX FIXME: drop this override (presumably for testing) to restore the 6-hour default
+    timeout-minutes: 1
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ needs.workflow-context.outputs.repository }}
+          ref: ${{ needs.workflow-context.outputs.sha }}
+          path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}
+      - if: inputs.runtime == 'aws-batch'
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
+      # XXX TODO:
+      - name: Setup runtime ${{ inputs.runtime }}
+        uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
+        with:
+          cli-version: ">=7.1.0"
+          runtime: ${{ inputs.runtime }}
+      - id: attach
+        name: Attach to AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # Background the CLI and `wait` on it so bash can run the SIGINT trap while still attached.
+          interrupt() {
+            echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
+            # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
+            kill -SIGTSTP %
+            wait %
+            exit 0
+          }
+          trap interrupt SIGINT
+
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          wait %
+    # Allow the workflow to be considered successful even if this job errors
+    # due to cancellation (timing out).  Unfortunately, this doesn't
+    # distinguish between error from cancellation and error from command
+    # failure, so we work around that below.
+    continue-on-error: true
+    # Emit a "conclusion" output for the job that's based on the built-in
+    # conclusion (success, failure, cancelled) of the "attach" step above.
+    # This is the conclusion we care about for the job since the job's own
+    # "conclusion" is masked/transformed by "continue-on-error: true" above.
+    outputs:
+      attach-step-conclusion: ${{ steps.attach.conclusion }}
+    needs: [wait-3, run-build, workflow-context]
+    if: needs.wait-3.outputs.attach-step-conclusion == 'cancelled'
+  # Since the wait-N jobs use "continue-on-error: true" out of necessity (to
+  # avoid failing the whole workflow when they time out and get cancelled), we
+  # use a final job here to succeed or fail the whole workflow based on the
+  # aggregate of their "attach" step conclusions.
+  conclusion:
+    needs: [wait-1, wait-2, wait-3, wait-4]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: All attach steps in wait-N jobs were successful (or skipped)
+        run: exit ${{ contains(needs.*.outputs.attach-step-conclusion, 'failure') && '1' || '0' }}
+  # XXX TODO: Jobs can fall off the end of our wait-N chain and appear to be
+  # successful/complete in GitHub but still running on AWS.  Probably very
+  # rare in reality, though, for an AWS job to take longer than 24h?
+  #   -trs, 12 Sept 2023
+  # Cancel the AWS Batch job if the GitHub workflow run is cancelled.
+  cancellation:
+    needs: [wait-4, run-build, workflow-context]
+    if: cancelled()
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - # Need to run this after the build repo is cloned so that cloning the
+        # build repo does not overwrite the .git dir and remove the extra support files
+        # that we need from nextstrain/.github repo
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ needs.workflow-context.outputs.repository }}
+          ref: ${{ needs.workflow-context.outputs.sha }}
+          path: ${{ env.NEXTSTRAIN_GITHUB_DIR }}
+      - if: inputs.runtime == 'aws-batch'
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
+      # XXX TODO:
+      - name: Setup runtime ${{ inputs.runtime }}
+        uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
+        with:
+          cli-version: ">=7.1.0"
+          runtime: ${{ inputs.runtime }}
+      - id: cancel
+        name: Cancel AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # XXX FIXME: instead of signaling here, use --cancel in new release
+          set -x
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          # `nextstrain` will cancel the AWS Batch job upon receiving SIGINT
+          # and stay attached while it waits for cancellation to occur, before
+          # finally exiting non-zero.  In the unlikely event that the job
+          # completes before cancellation can occur, it'll exit 0, and we want
+          # to treat that as an error.
+          sleep 5
+          kill -SIGINT %
+          sleep 1
+          kill -SIGINT %
+          wait % && exit 1 || exit 0
+    # The cancellation job may fail, but we don't want that to impact the
+    # overall workflow run status.
+    continue-on-error: true
diff --git a/.github/workflows/pathogen-repo-build.yaml.in b/.github/workflows/pathogen-repo-build.yaml.in
index fc7c6b9..cc3d3ac 100644
--- a/.github/workflows/pathogen-repo-build.yaml.in
+++ b/.github/workflows/pathogen-repo-build.yaml.in
@@ -178,7 +178,8 @@ jobs:
       # Need to run this after the build repo is cloned so that cloning the
       # build repo does not overwrite the .git dir and remove the extra support files
       # that we need from nextstrain/.github repo
-      - name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
+      - &checkout-workflow-support
+        name: Checkout ${{ needs.workflow-context.outputs.repository }} (sha ${{ needs.workflow-context.outputs.sha }})
         uses: actions/checkout@v3
         with:
           repository: ${{ needs.workflow-context.outputs.repository }}
@@ -207,7 +208,8 @@ jobs:
             | "$NEXTSTRAIN_GITHUB_DIR"/bin/json-to-envvars
             | tee -a "$GITHUB_ENV"
 
-      - if: inputs.runtime == 'aws-batch'
+      - &setup-aws-credentials
+        if: inputs.runtime == 'aws-batch'
         uses: aws-actions/configure-aws-credentials@v2
         with:
           aws-region: us-east-1
@@ -216,7 +218,8 @@ jobs:
           role-to-assume: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'arn:aws:iam::827581582529:role/GitHubActionsRoleNextstrainBatchJobs' || '' }}
 
       # XXX TODO:
-      - name: Setup runtime ${{ inputs.runtime }}
+      - &setup-runtime
+        name: Setup runtime ${{ inputs.runtime }}
         uses: ./.git/nextstrain/.github/actions/setup-nextstrain-cli
         with:
           cli-version: ">=7.1.0"
@@ -256,3 +259,135 @@ jobs:
             logs/
             .snakemake/log/
             ${{ inputs.artifact-paths }}
+
+    outputs:
+      AWS_BATCH_JOB_ID: ${{ env.AWS_BATCH_JOB_ID }}
+
+  # Wait for up to 6 hours (the maximum/default GitHub Actions job timeout¹)
+  # for the AWS Batch job to finish.
+  #
+  # ¹ See jobs.<job_id>.timeout-minutes in the GitHub Actions workflow syntax reference.
+  wait-1: &wait
+    # XXX FIXME: drop this override (presumably for testing) to restore the 6-hour default
+    timeout-minutes: 1
+    needs: [run-build, workflow-context]
+    if: needs.run-build.outputs.AWS_BATCH_JOB_ID
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - *checkout-workflow-support
+      - *setup-aws-credentials
+      - *setup-runtime
+
+      - id: attach
+        name: Attach to AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # Background the CLI and `wait` on it so bash can run the SIGINT trap while still attached.
+          interrupt() {
+            echo "Trapped SIGINT (job timed out/cancelled); detaching" >&2
+            # XXX FIXME: instead of converting SIGINT → SIGTSTP here, use --detach-on-interrupt in new release
+            kill -SIGTSTP %
+            wait %
+            exit 0
+          }
+          trap interrupt SIGINT
+
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          wait %
+
+    # Allow the workflow to be considered successful even if this job errors
+    # due to cancellation (timing out).  Unfortunately, this doesn't
+    # distinguish between error from cancellation and error from command
+    # failure, so we work around that below.
+    continue-on-error: true
+
+    # Emit a "conclusion" output for the job that's based on the built-in
+    # conclusion (success, failure, cancelled) of the "attach" step above.
+    # This is the conclusion we care about for the job since the job's own
+    # "conclusion" is masked/transformed by "continue-on-error: true" above.
+    outputs:
+      attach-step-conclusion: ${{ steps.attach.conclusion }}
+
+  # Wait for up to another 6 hours (hours 6–12) if the preceding wait-N job
+  # timed out while attached to the AWS Batch job.
+  wait-2:
+    <<: *wait
+    needs: [wait-1, run-build, workflow-context]
+    if: needs.wait-1.outputs.attach-step-conclusion == 'cancelled'
+
+  # Likewise for hours 12–18, if wait-2 timed out while attached.
+  wait-3:
+    <<: *wait
+    needs: [wait-2, run-build, workflow-context]
+    if: needs.wait-2.outputs.attach-step-conclusion == 'cancelled'
+
+  # Likewise for hours 18–24, if wait-3 timed out while attached.
+  wait-4:
+    <<: *wait
+    needs: [wait-3, run-build, workflow-context]
+    if: needs.wait-3.outputs.attach-step-conclusion == 'cancelled'
+
+  # Since the wait-N jobs use "continue-on-error: true" out of necessity (to
+  # avoid failing the whole workflow when they time out and get cancelled), we
+  # use a final job here to succeed or fail the whole workflow based on the
+  # aggregate of their "attach" step conclusions.
+  conclusion:
+    needs: [wait-1, wait-2, wait-3, wait-4]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: All attach steps in wait-N jobs were successful (or skipped)
+        run: exit ${{ contains(needs.*.outputs.attach-step-conclusion, 'failure') && '1' || '0' }}
+
+  # XXX TODO: Jobs can fall off the end of our wait-N chain and appear to be
+  # successful/complete in GitHub but still running on AWS.  Probably very
+  # rare in reality, though, for an AWS job to take longer than 24h?
+  #   -trs, 12 Sept 2023
+
+  # Cancel the AWS Batch job if the GitHub workflow run is cancelled.
+  cancellation:
+    needs: [wait-4, run-build, workflow-context]
+    if: cancelled()
+    runs-on: ubuntu-latest
+    steps:
+      # Uses needs.workflow-context.outputs
+      - *checkout-workflow-support
+      - *setup-aws-credentials
+      - *setup-runtime
+
+      - id: cancel
+        name: Cancel AWS Batch job
+        env:
+          AWS_BATCH_JOB_ID: ${{ needs.run-build.outputs.AWS_BATCH_JOB_ID }}
+        run: |
+          # XXX FIXME: instead of signaling here, use --cancel in new release
+          set -x
+          nextstrain build \
+            --aws-batch \
+            --attach "$AWS_BATCH_JOB_ID" \
+            --no-download \
+            . \
+            &
+
+          # `nextstrain` will cancel the AWS Batch job upon receiving SIGINT
+          # and stay attached while it waits for cancellation to occur, before
+          # finally exiting non-zero.  In the unlikely event that the job
+          # completes before cancellation can occur, it'll exit 0, and we want
+          # to treat that as an error.
+          sleep 5
+          kill -SIGINT %
+          sleep 1
+          kill -SIGINT %
+          wait % && exit 1 || exit 0
+
+    # The cancellation job may fail, but we don't want that to impact the
+    # overall workflow run status.
+    continue-on-error: true