
# Add Nextclade dataset version tracking file #390

**Open** · wants to merge 1 commit into base `master`
**README.md** — 12 additions, 4 deletions
@@ -135,17 +135,21 @@ Whenever the underlying nextclade dataset (reference tree, QC rules) and/or next
To tell ingest not to use the cached `nextclade.tsv`/`aligned.fasta` and instead perform a full rerun, you need to add an (empty) touchfile to the S3 bucket:

```diff
-aws s3 cp - s3://nextstrain-ncov-private/nextclade.tsv.zst.renew < /dev/null
-aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade.tsv.zst.renew < /dev/null
+for file in [ "nextclade.tsv.zst.renew", "version_sars-cov-2.txt" ]; do
+    aws s3 cp - s3://nextstrain-ncov-private/$file < /dev/null
+    aws s3 cp - s3://nextstrain-data/files/ncov/open/$file < /dev/null
+done
```

**Member**, on lines +138 to +141:

> This is syntactically valid bash, but not what you intended. It'd produce four loop iterations where:
>
> ```bash
> file="["
> file="nextclade.tsv.zst.renew,"
> file="version_sars-cov-2.txt"
> file="]"
> ```
>
> 🙃
>
> You want:
>
> ```suggestion
> for file in nextclade.tsv.zst.renew version_sars-cov-2.txt; do
>     aws s3 cp - s3://nextstrain-ncov-private/$file < /dev/null
>     aws s3 cp - s3://nextstrain-data/files/ncov/open/$file < /dev/null
> done
> ```
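The word-splitting pitfall described in the review can be reproduced in isolation; the filenames below are shortened stand-ins for the real touchfiles:

```shell
#!/usr/bin/env bash
# Bash `for` splits its word list on whitespace; `[`, `]`, and the
# comma-suffixed names are literal loop values, not list syntax.
broken=$(for file in [ "a.renew," "b.txt" ]; do printf '%s\n' "$file"; done)
echo "$broken" | wc -l    # 4 iterations: [  a.renew,  b.txt  ]

# Without brackets and commas, each bare word is one iteration.
fixed=$(for file in a.renew b.txt; do printf '%s\n' "$file"; done)
echo "$fixed" | wc -l     # 2 iterations: a.renew  b.txt
```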

Ingest will automatically remove the touchfiles after it has completed the rerun.

To rerun Nextclade using the `sars-cov-2-21L` dataset - which is only necessary when the calculation of `immune_escape` and `ace2_binding` changes - you need to add an (empty) touchfile to the s3 bucket:

```diff
-aws s3 cp - s3://nextstrain-ncov-private/nextclade_21L.tsv.zst.renew < /dev/null
-aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < /dev/null
+for file in [ "nextclade_21L.tsv.zst.renew", "version_sars-cov-2-21L.txt" ]; do
+    aws s3 cp - s3://nextstrain-ncov-private/$file < /dev/null
+    aws s3 cp - s3://nextstrain-data/files/ncov/open/$file < /dev/null
+done
```

**Member**, on lines +149 to +152:

> Ditto.

## Required environment variables
@@ -157,3 +161,7 @@ aws s3 cp - s3://nextstrain-data/files/ncov/open/nextclade_21L.tsv.zst.renew < /
- `AWS_SECRET_ACCESS_KEY`
- `SLACK_TOKEN`
- `SLACK_CHANNELS`

## Unstable files produced by workflow

- `version_sars-cov-2.txt` and `version_sars-cov-2-21L.txt`: track the version of the Nextclade dataset used to generate the `nextclade.tsv` and `nextclade_21L.tsv` files. Format: one `timestamp dataset_version` line (e.g. `2023-02-06T14:40:23Z 2023-02-01T12:00:00Z`) per run since, and including, the last full run.
**Contributor**:

> Am I understanding correctly that the first line of the versions file is the last full Nextclade run?

**Member (author)**:

> Yes, that's what it should be in practice!
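A sketch on a hypothetical versions file makes this concrete: entries are appended per run and the file is replaced on a full rerun, so the first line dates the last full run.

```shell
#!/usr/bin/env bash
# Hypothetical versions file with two runs against the same dataset tag:
# the first line is the full run, the second a cache-reusing rerun.
version_file=$(mktemp)
printf '%s\n' \
  "2023-02-06T14:40:23Z 2023-02-01T12:00:00Z" \
  "2023-02-07T09:12:01Z 2023-02-01T12:00:00Z" > "$version_file"

# Field 1 of the first line is the timestamp of the last full run.
last_full_run=$(head -n1 "$version_file" | cut -d' ' -f1)
echo "$last_full_run"    # 2023-02-06T14:40:23Z
```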

**workflow/snakemake_rules/nextclade.smk** — 15 additions, 2 deletions
```diff
@@ -138,13 +138,26 @@ rule download_nextclade_executable:
     """

 rule download_nextclade_dataset:
-    """Download Nextclade dataset"""
+    """
+    Download Nextclade dataset
+    Append the dataset version used for this run to the version file with timestamp of download time
+    """
     input: "nextclade"
     output:
-        dataset = "data/nextclade_data/{dataset_name}.zip"
+        dataset = "data/nextclade_data/{dataset_name}.zip",
+        version = "data/nextclade_data/version_{dataset_name}.txt",
+    params:
+        dst_version_file=config["s3_dst"] + "/version_{dataset_name}.txt",
+        src_version_file=config["s3_src"] + "/version_{dataset_name}.txt",
     shell:
         """
+        ./bin/download-from-s3 {params.dst_version_file} {output.version} 0 || \
+        ./bin/download-from-s3 {params.src_version_file} {output.version} 0 || \
+        touch {output.version}
```
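The `||` chain added to the shell block behaves like the following sketch, with plain `cp` on deliberately missing local paths standing in for the two `./bin/download-from-s3` calls:

```shell
#!/usr/bin/env bash
# Mimic the dst-then-src-then-empty fallback. Both "remote" copies are
# deliberately missing here, so the chain falls through to `touch` and the
# run starts with an empty version file.
dst="/nonexistent/dst/version.txt"
src="/nonexistent/src/version.txt"
out=$(mktemp -u)   # -u: generate a path only; the file does not exist yet
cp "$dst" "$out" 2>/dev/null || cp "$src" "$out" 2>/dev/null || touch "$out"
wc -c < "$out"     # 0 bytes: the empty-file fallback ran
```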
**Contributor**, on lines +149 to +156:

> It would be best to move this to a separate rule so that we can continue to keep the main pipeline untangled from the S3 interactions. I think this file can follow our current pattern of having a `create_empty_*` and a `download_*_from_s3` rule so that this pipeline can run without S3 interactions.

**Member (author)**:

> Good point; not something I've ever done, so I wasn't quite aware this was a guarantee we make.

**Member**:

> More of maintaining a separation of concerns than a guarantee, I think. (Although there are a few external users of ncov-ingest who might notice if that separation breaks down.)


```diff
         ./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
+        printf %s "$(date --utc +%FT%TZ) " >> {output.version}
+        nextclade dataset list --name="{wildcards.dataset_name}" --json | jq -r '.[0].attributes.tag.value' >>{output.version}
```
**Member**, on lines 158 to +160:

> Mixing of `./nextclade` vs. `nextclade`. Is that a mismatch?

**tsibley (Member)**, Feb 13, 2023, on lines +159 to +160:

> There should be only one writer to this file, I think, but it'd still be more robust (and IMO clearer) to append the whole line at once rather than in two separate operations.
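A single-write variant along the lines the reviewer suggests might look like this sketch; `tag` is a hard-coded stand-in for the `nextclade dataset list ... | jq` output:

```shell
#!/usr/bin/env bash
# Build the whole "timestamp dataset_version" line first, then append it to
# the version file in one operation instead of two separate redirections.
tag="2023-02-01T12:00:00Z"   # stand-in for the real dataset tag
version_file=$(mktemp)
printf '%s %s\n' "$(date --utc +%FT%TZ)" "$tag" >> "$version_file"
cat "$version_file"          # e.g. 2023-02-06T14:40:23Z 2023-02-01T12:00:00Z
```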

"""

GENES = "E,M,N,ORF1a,ORF1b,ORF3a,ORF6,ORF7a,ORF7b,ORF8,ORF9b,S"
Expand Down
**workflow/snakemake_rules/upload.smk** — 25 additions, 1 deletion
```diff
@@ -105,6 +105,29 @@ rule remove_rerun_touchfile:
         touch {output}
         """

+rule upload_dataset_version:
+    """
+    Upload the Nextclade dataset version file
+    """
+    input:
+        metadata_upload = f"data/{database}/metadata.tsv.zst.upload",
+        version_file = "data/nextclade_data/version_{dataset_name}.txt"
+    output:
+        touch("data/nextclade_data/version_{dataset_name}.upload")
+    params:
+        quiet = "" if send_notifications else "--quiet",
+        s3_bucket = config.get("s3_dst",""),
+        cloudfront_domain = config.get("cloudfront_domain", ""),
+        remote_filename = "version_{dataset_name}.txt",
+    shell:
+        """
+        ./bin/upload-to-s3 \
+            {params.quiet} \
+            {input.version_file:q} \
+            {params.s3_bucket:q}/{params.remote_filename:q} \
+            {params.cloudfront_domain} 2>&1 | tee {output}
+        """
+
 rule upload:
     """
     Requests one touch file for each uploaded remote file
@@ -117,6 +140,7 @@ rule upload:
                 "nextclade.tsv.zst",
                 "nextclade_21L.tsv.zst",
             ]
-        ]
+        ],
+        dataset_version = [f"data/nextclade_data/version_{dataset_name}.upload" for dataset_name in ["sars-cov-2", "sars-cov-2-21L"]],
     output:
         touch(f"data/{database}/upload.done")
```