From 3b45c7ec26c210b2cae180bfadadc620d93b387a Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 27 Oct 2023 15:48:08 -0700 Subject: [PATCH] Move ncbi taxon id to config --- ingest/config/config.yaml | 2 ++ ingest/workflow/snakemake_rules/fetch_sequences.smk | 4 +++- phylogenetic/config/description.md | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index a8e26f70..eaa067a0 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -1,5 +1,7 @@ # Sources of sequences to include in the ingest run sources: ['genbank'] +# Pathogen NCBI Taxonomy ID +ncbi_taxon_id: '10244' # Params for the transform rule transform: diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index b1c81cf4..b8411801 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -20,9 +20,11 @@ rule fetch_ncbi_dataset_package: retries: 5 # Requires snakemake 7.7.0 or later benchmark: "benchmarks/fetch_ncbi_dataset_package.txt" + params: + ncbi_taxon_id=config["ncbi_taxon_id"], shell: """ - datasets download virus genome taxon 10244 \ + datasets download virus genome taxon {params.ncbi_taxon_id} \ --no-progressbar \ --filename {output.dataset_package} """ diff --git a/phylogenetic/config/description.md b/phylogenetic/config/description.md index c4182b33..99a8fb59 100644 --- a/phylogenetic/config/description.md +++ b/phylogenetic/config/description.md @@ -18,7 +18,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mon #### Underlying data We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/), -using NCBI Taxonomy ID "10244", as starting point for these analyses. 
+using the NCBI Taxonomy ID defined in [ingest/config/config.yaml](https://github.com/nextstrain/monkeypox/blob/master/ingest/config/config.yaml), as the starting point for these analyses. Curated sequences and metadata are available as flat files at: - [data.nextstrain.org/files/workflows/monkeypox/sequences.fasta.xz](https://data.nextstrain.org/files/workflows/monkeypox/sequences.fasta.xz) - [data.nextstrain.org/files/workflows/monkeypox/metadata.tsv.gz](https://data.nextstrain.org/files/workflows/monkeypox/metadata.tsv.gz)