From 3b45c7ec26c210b2cae180bfadadc620d93b387a Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Fri, 27 Oct 2023 15:48:08 -0700
Subject: [PATCH] Move ncbi taxon id to config

---
 ingest/config/config.yaml                           | 2 ++
 ingest/workflow/snakemake_rules/fetch_sequences.smk | 4 +++-
 phylogenetic/config/description.md                  | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index a8e26f70..eaa067a0 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -1,5 +1,7 @@
 # Sources of sequences to include in the ingest run
 sources: ['genbank']
+# NCBI Taxonomy ID of the pathogen, used to download sequences with NCBI Datasets
+ncbi_taxon_id: '10244'
 
 # Params for the transform rule
 transform:
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index b1c81cf4..b8411801 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -20,9 +20,11 @@ rule fetch_ncbi_dataset_package:
     retries: 5  # Requires snakemake 7.7.0 or later
     benchmark:
         "benchmarks/fetch_ncbi_dataset_package.txt"
+    params:
+        ncbi_taxon_id=config["ncbi_taxon_id"],
     shell:
         """
-        datasets download virus genome taxon 10244 \
+        datasets download virus genome taxon {params.ncbi_taxon_id} \
             --no-progressbar \
             --filename {output.dataset_package}
         """
diff --git a/phylogenetic/config/description.md b/phylogenetic/config/description.md
index c4182b33..99a8fb59 100644
--- a/phylogenetic/config/description.md
+++ b/phylogenetic/config/description.md
@@ -18,7 +18,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mon
 
 #### Underlying data
 We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/),
-using NCBI Taxonomy ID "10244", as starting point for these analyses.
+using the NCBI Taxonomy ID defined in [ingest/config/config.yaml](https://github.com/nextstrain/monkeypox/blob/master/ingest/config/config.yaml), as the starting point for these analyses.
 Curated sequences and metadata are available as flat files at:
 - [data.nextstrain.org/files/workflows/monkeypox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/monkeypox/sequences.fasta.xz)
 - [data.nextstrain.org/files/workflows/monkeypox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/monkeypox/metadata.tsv.gz)