From cf9961711d90569a2a20e009f20956902abc5397 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 6 Mar 2024 14:36:48 -0800 Subject: [PATCH] Refactor upload configs to match pathogen-repo-guide https://github.com/nextstrain/pathogen-repo-guide/tree/f825132e441a672586ec1748af4cdb149669f7c3/ingest/build-configs/nextstrain-automation --- .../nextstrain-automation/config.yaml | 37 ++++++++++--------- .../nextstrain-automation/upload.smk | 19 +++++----- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 9f00a7e..2e819bb 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -1,22 +1,23 @@ -# Optional configs used by Nextstrain team -# Params for uploads -upload: - # Upload params for AWS S3 - s3: - # AWS S3 Bucket with prefix - dst: 's3://nextstrain-data/files/workflows/zika' - # Mapping of files to upload, with key as remote file name and the value - # the local file path relative to the ingest directory. - files_to_upload: - genbank.ndjson.xz: data/genbank.ndjson - all_sequences.ndjson.xz: data/sequences.ndjson - metadata.tsv.gz: results/metadata.tsv - sequences.fasta.xz: results/sequences.fasta - alignment.fasta.xz: data/alignment.fasta - insertions.csv.gz: data/insertions.csv - translations.zip: data/translations.zip +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run with additional Nextstrain automation rules. - cloudfront_domain: 'data.nextstrain.org' +# Custom rules to run as part of the Nextstrain automated workflow +# The paths should be relative to the ingest directory. 
+custom_rules: + - build-configs/nextstrain-automation/upload.smk + +# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads +# This is required as long as we are using the AWS CLI for uploads +cloudfront_domain: "data.nextstrain.org" + +# Nextstrain AWS S3 Bucket with pathogen prefix +# Replace <pathogen> with the pathogen repo name. +s3_dst: "s3://nextstrain-data/files/workflows/<pathogen>" + +files_to_upload: + genbank.ndjson.zst: data/genbank.ndjson + metadata.tsv.zst: results/metadata.tsv + sequences.fasta.zst: results/sequences.fasta # Toggle for Slack notifications send_slack_notifications: True diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk index 60c5c9b..1c82699 100644 --- a/ingest/build-configs/nextstrain-automation/upload.smk +++ b/ingest/build-configs/nextstrain-automation/upload.smk @@ -1,13 +1,14 @@ """ -This part of the workflow handles uploading files to a specified destination. +This part of the workflow handles uploading files to AWS S3. -Uses predefined wildcard `file_to_upload` determine input and predefined -wildcard `remote_file_name` as the remote file name in the specified destination. +Files to upload must be defined in the `files_to_upload` config param, where +the keys are the remote files and the values are the local filepaths +relative to the ingest directory. -Produces output files as `data/upload/{upload_target_name}/{remote_file_name}.done`. +Produces a single file for each uploaded file: + "results/upload/{remote_file}.upload" -Currently only supports uploads to AWS S3, but additional upload rules can -be easily added as long as they follow the output pattern described above. +The rule `upload_all` can be used as a target to upload all files. """ import os @@ -27,7 +28,7 @@ def _get_upload_inputs(wildcards): `send_notifications` is True. 
""" inputs = { - "file_to_upload": config["upload"]["s3"]["files_to_upload"][ + "file_to_upload": config["files_to_upload"][ wildcards.remote_file_name ], } @@ -52,8 +53,8 @@ rule upload_to_s3: "data/upload/s3/{remote_file_name}.done", params: quiet="" if send_notifications else "--quiet", - s3_dst=config["upload"].get("s3", {}).get("dst", ""), - cloudfront_domain=config["upload"].get("s3", {}).get("cloudfront_domain", ""), + s3_dst=config["s3_dst"], + cloudfront_domain=config["cloudfront_domain"], shell: """ ./vendored/upload-to-s3 \