diff --git a/.gitignore b/.gitignore
index ef13b00..a280ae4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,40 +1,25 @@
-# Files created by the pipeline, which we want to keep out of git
-# (or at least out of _this_ git repo).
+# pipeline output #
+benchmarks/
 data/
+logs/
 results/
-build/
-# Sensitive environment variables
-environment*
-
-# Snakemake state dir
-/.snakemake
-
-# Local config overrides
-/config_local.yaml
+# snakemake output #
+.snakemake
+snakemake_log
 
 # For Python #
-##############
 *.pyc
 .tox/
 .cache/
-
-# Compiled source #
-###################
-*.com
-*.class
-*.dll
-*.exe
-*.o
-*.so
+__pycache__/*
 
 # OS generated files #
-######################
 .DS_Store
-.DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 Icon?
 ehthumbs.db
 Thumbs.db
+*~
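The tightened ignore list above can be spot-checked with git itself; `git check-ignore` is stock git, though the example paths here are only illustrations:

```bash
# Ask git which pattern (if any) ignores each candidate path.
git check-ignore -v benchmarks/curate.txt data/ncbi.ndjson logs/run.log results/metadata.tsv
```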
diff --git a/ingest/README.md b/ingest/README.md
index c3b3a16..e89d2d6 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -1,105 +1,42 @@
-# Ingest
+# Ingest workflow
 
 This workflow ingests public data from NCBI and outputs curated
 metadata and sequences that can be used as input for the phylogenetic
 workflow.
 
-## Workflow Usage
+If you have another data source or private data that needs to be
+formatted for the phylogenetic workflow, then you can use a similar
+workflow to curate your own data.
 
-The workflow can be run from the top level pathogen repo directory:
+## Config
 
-```bash
-nextstrain build ingest
-```
-
-Alternatively, the workflow can also be run from within the ingest
-directory:
-
-```bash
-cd ingest
-nextstrain build .
-```
-
-This produces the default outputs of the ingest workflow:
-
-- metadata = results/metadata.tsv
-- sequences = results/sequences.fasta
-
-### Dumping the full raw metadata from NCBI Datasets
-
-The workflow has a target for dumping the full raw metadata from NCBI
-Datasets.
-
-```bash
-nextstrain build ingest dump_ncbi_dataset_report
-```
-
-This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`,
-which you can inspect to determine what fields and data to use if you
-want to configure the workflow for your pathogen.
-
-## Defaults
-
-The defaults directory contains all of the default configurations for
+The defaults directory contains all of the default configuration for
 the ingest workflow.
 
-[defaults/config.yaml](defaults/config.yaml) contains all of the
-default configuration parameters used for the ingest workflow. Use
-Snakemake's `--configfile`/`--config` options to override these
-default values.
+[defaults/config.yaml][] contains all of the default configuration
+parameters used for the ingest workflow. Use Snakemake's
+`--configfile`/`--config` options to override these default values.
 
 ## Snakefile and rules
 
 The rules directory contains separate Snakefiles (`*.smk`) as modules
 of the core ingest workflow. The modules of the workflow are in
-separate files to keep the main ingest [Snakefile](Snakefile) succinct
-and organized.
-
-The `workdir` is hardcoded to be the ingest directory so all filepaths
-for inputs/outputs should be relative to the ingest directory.
-
-Modules are all
-[included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
-in the main Snakefile in the order that they are expected to run.
-
-### Nextclade
-
-Nextstrain is pushing to standardize ingest workflows with Nextclade
-runs to include Nextclade outputs in our publicly hosted data.
-However, if a Nextclade dataset does not already exist, it requires
-curated data as input, so we are making Nextclade steps optional here.
-
-If Nextclade config values are included, the Nextclade rules will
-create the final metadata TSV by joining the Nextclade output with the
-metadata. If Nextclade configs are not included, we rename the subset
-metadata TSV to the final metadata TSV.
-
-To run Nextclade rules, include the `defaults/nextclade_config.yaml`
-config file with:
-
-```bash
-nextstrain build ingest --configfile defaults/nextclade_config.yaml
-```
-
-> [!TIP]
-> If the Nextclade dataset is stable and you always want to run the
-> Nextclade rules as part of ingest, we recommend moving the Nextclade
-> related config parameters from the `defaults/nextclade_config.yaml`
-> file to the default config file `defaults/config.yaml`.
-
-## Build configs
-
-The build-configs directory contains custom configs and rules that
-override and/or extend the default workflow.
-
-- [nextstrain-automation](build-configs/nextstrain-automation/) -
-  automated internal Nextstrain builds.
+separate files to keep the main ingest [Snakefile][] succinct and
+organized. Modules are all [included][] in the main Snakefile in the
+order that they are expected to run.
 
 ## Vendored
 
-This repository uses
-[`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies
-of ingest scripts in [vendored](vendored), from
-[nextstrain/ingest](https://github.com/nextstrain/ingest).
+This repository uses [`git subrepo`][] to manage copies of ingest
+scripts in [vendored][], from [nextstrain/ingest][].
+
+See [vendored/README.md][] for instructions on how to update the
+vendored scripts.
 
-See [vendored/README.md](vendored/README.md#vendoring) for
-instructions on how to update the vendored scripts.
+[defaults/config.yaml]: ./defaults/config.yaml
+[`git subrepo`]: https://github.com/ingydotnet/git-subrepo
+[included]: https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes
+[nextstrain/ingest]: https://github.com/nextstrain/ingest
+[Snakefile]: ./Snakefile
+[vendored]: ./vendored
+[vendored/README.md]: ./vendored/README.md#vendoring
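The `--configfile`/`--config` overrides the README points at work through `nextstrain build` as well; a quick sketch (the taxon ID shown is the repo's own default, and the override file name is hypothetical):

```bash
# One-off override of a single default value...
nextstrain build ingest --config ncbi_taxon_id="11089"

# ...or keep a set of overrides in a separate file.
nextstrain build ingest --configfile my-overrides.yaml
```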
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 98b14a2..5d81090 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -1,58 +1,20 @@
-"""
-This is the main ingest Snakefile that orchestrates the full ingest workflow
-and defines its default outputs.
-"""
-
-
-# The workflow filepaths are written relative to this Snakefile's base
-# directory
 workdir: workflow.current_basedir
-
-
-# Use default configuration values. Override with Snakemake's
-# --configfile/--config options.
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "defaults/config.yaml"
 
-# This is the default rule that Snakemake will run when there are no
-# specified targets. The default output of the ingest workflow is
-# usually the curated metadata and sequences. Nextstrain-maintained
-# ingest workflows will produce metadata files with the standard
-# Nextstrain fields and additional fields that are pathogen specific.
-# We recommend using these standard fields in custom ingests as well
-# to minimize the customizations you will need for the downstream
-# phylogenetic workflow.
-
-
-# TODO: Add link to centralized docs on standard Nextstrain metadata fields
 rule all:
     input:
         "results/sequences.fasta",
         "results/metadata.tsv",
 
-# Note that only PATHOGEN-level customizations should be added to
-# these core steps, meaning they are custom rules necessary for all
-# builds of the pathogen. If there are build-specific customizations,
-# they should be added with the custom_rules imported below to ensure
-# that the core workflow is not complicated by build-specific rules.
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
 
-# We are pushing to standardize ingest workflows with Nextclade runs
-# to include Nextclade outputs in our publicly hosted data. However,
-# if a Nextclade dataset does not already exist, creating one requires
-# curated data as input, so we are making Nextclade steps optional
-# here.
-#
-# If Nextclade config values are included, the nextclade rules will
-# create the final metadata TSV by joining the Nextclade output with
-# the metadata. If Nextclade configs are not included, we rename the
-# subset metadata TSV to the final metadata TSV. To run nextclade.smk
-# rules, include the `defaults/nextclade_config.yaml` config file with
-# `nextstrain build ingest --configfile
-# defaults/nextclade_config.yaml`.
+# If a `nextclade` config is provided, the nextclade rules will create
+# the final metadata TSV by joining the Nextclade output with the
+# metadata. Otherwise, we rename the subset metadata TSV to the final
+# metadata TSV.
 if "nextclade" in config:
     include: "rules/nextclade.smk"
 
@@ -66,21 +28,20 @@ else:
             metadata="results/metadata.tsv",
         shell:
             """
-            mv {input.metadata} {output.metadata}
+            mv {input.metadata:q} {output.metadata:q}
            """
 
 
-# Allow users to import custom rules provided via the config.
-# This allows users to run custom rules that can extend or override
-# the workflow. A concrete example of using custom rules is the
-# extension of the workflow with rules to support the Nextstrain
-# automation that uploads files and sends internal Slack
-# notifications. For extensions, the user will have to specify the
-# custom rule targets when running the workflow. For overrides, the
-# custom Snakefile will have to use the `ruleorder` directive to allow
-# Snakemake to handle ambiguous rules
-# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
-if "custom_rules" in config:
-    for rule_file in config["custom_rules"]:
-        include: rule_file
+rule clean:
+    params:
+        targets = [
+            "benchmarks",
+            "data",
+            "logs",
+            "results",
+        ]
+    shell:
+        """
+        rm -rfv {params.targets}
+        """
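The new `clean` rule gives the workflow a reset target. Assuming the same invoke-by-rule-name pattern the old README used for other targets, a run would look like:

```bash
# Delete benchmarks/, data/, logs/, and results/ so the next run starts fresh.
nextstrain build ingest clean
```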
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index e9af859..ae4c1ca 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -1,14 +1,5 @@
-# This configuration file should contain all required configuration parameters
-# for the ingest workflow to run to completion.
-#
-# Define optional config parameters with their default values here so that users
-# do not have to dig through the workflows to figure out the default values
-
-# Required to fetch from Entrez
-entrez_search_term: ""
-
-# Required to fetch from NCBI Datasets
-ncbi_taxon_id: ""
+# NCBI taxonomy ID for yellow fever virus
+ncbi_taxon_id: "11089"
 
 # The list of NCBI Datasets fields to include from NCBI Datasets output
 # These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
@@ -34,16 +25,18 @@ ncbi_datasets_fields:
 # Config parameters related to the curate pipeline
 curate:
-  # URL pointed to public generalized geolocation rules
+  # URL pointing to the public generalized geolocation rules.
   # For the Nextstrain team, this is currently
-  # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
+  # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv".
   geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
   # The path to the local geolocation rules within the pathogen repo
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "defaults/geolocation_rules.tsv"
-  # List of field names to change where the key is the original field name and the value is the new field name
-  # The original field names should match the ncbi_datasets_fields provided above.
-  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
+  # List of field names to change where the key is the original field
+  # name and the value is the new field name. The original field names
+  # should match the ncbi_datasets_fields provided above. This is the
+  # first step in the pipeline, so any references to field names in
+  # the configs below should use the new field names.
   field_map:
     accession: accession
     accession_version: accession_version
@@ -69,8 +62,9 @@ curate:
   strain_backup_fields: ["accession"]
   # List of date fields to standardize to ISO format YYYY-MM-DD
   date_fields: ["date", "date_released", "date_updated"]
-  # List of expected date formats that are present in the date fields provided above
-  # These date formats should use directives expected by datetime
+  # List of expected date formats that are present in the date fields
+  # provided above. These date formats should use directives expected
+  # by datetime.
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
   titlecase:
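The `expected_date_formats` values are standard Python `strptime` directives, so a new format string can be sanity-checked from the shell before it goes into the config; the sample dates below are made up:

```bash
# Each format should round-trip a representative date string without error.
python3 -c 'from datetime import datetime; print(datetime.strptime("2001-03", "%Y-%m"))'
python3 -c 'from datetime import datetime; print(datetime.strptime("2001-03-11T04:05:06Z", "%Y-%m-%dT%H:%M:%SZ"))'
```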
diff --git a/ingest/defaults/nextclade_config.yaml b/ingest/defaults/nextclade_config.yaml
deleted file mode 100644
index 3c48bc8..0000000
--- a/ingest/defaults/nextclade_config.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
-# Note that this requires a Nextclade dataset to already exist for your pathogen.
-nextclade:
-  # The name of the Nextclade dataset to use for running nextclade.
-  # Run `nextclade dataset list` to get a full list of available Nextclade datasets
-  dataset_name: ""
-  # Path to the mapping for renaming Nextclade output columns
-  # The path should be relative to the ingest directory
-  field_map: "config/nextclade_field_map.tsv"
-  # This is the ID field you would use to match the Nextclade output with the record metadata.
-  # This should be the new name that you have defined in your field map.
-  id_field: "seqName"
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
deleted file mode 100644
index 513b0fd..0000000
--- a/ingest/defaults/nextclade_field_map.tsv
+++ /dev/null
@@ -1,18 +0,0 @@
-# TSV file that is a mapping of column names for Nextclade output TSV
-# The first column should be the original column name of the Nextclade TSV
-# The second column should be the new column name to use in the final metadata TSV
-# Nextclade can have pathogen specific output columns so make sure to check which
-# columns would be useful for your downstream phylogenetic analysis.
-seqName	seqName
-clade	clade
-lineage	lineage
-coverage	coverage
-totalMissing	missing_data
-totalSubstitutions	divergence
-totalNonACGTNs	nonACGTN
-qc.missingData.status	QC_missing_data
-qc.mixedSites.status	QC_mixed_sites
-qc.privateMutations.status	QC_rare_mutations
-qc.frameShifts.status	QC_frame_shifts
-qc.stopCodons.status	QC_stop_codons
-frameShifts	frame_shifts
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 112eb34..bf7bfa8 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -13,15 +13,13 @@
 """
 
-# The following two rules can be ignored if you choose not to use the
-# generalized geolocation rules that are shared across pathogens.
-# The Nextstrain team will try to maintain a generalized set of geolocation
-# rules that can then be overridden by local geolocation rules per pathogen repo.
 rule fetch_general_geolocation_rules:
     output:
         general_geolocation_rules="data/general-geolocation-rules.tsv",
     params:
         geolocation_rules_url=config["curate"]["geolocation_rules_url"],
+    benchmark:
+        "benchmarks/fetch_general_geolocation_rules.txt"
     shell:
         """
         curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
         """
@@ -34,10 +32,12 @@ rule concat_geolocation_rules:
         local_geolocation_rules=config["curate"]["local_geolocation_rules"],
     output:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
+    benchmark:
+        "benchmarks/concat_geolocation_rules.txt"
     shell:
-        # why is this `>>` and not `>`
         """
-        cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules}
+        cat {input.general_geolocation_rules} {input.local_geolocation_rules} \
+            > {output.all_geolocation_rules}
         """
@@ -48,17 +48,9 @@ def format_field_map(field_map: dict[str, str]) -> str:
     return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()])
 
 
-# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
-# You may want to add and/or remove steps from the pipeline for custom metadata
-# curation for your pathogen. Note that the curate pipeline is streaming NDJSON
-# records between scripts, so any custom scripts added to the pipeline should expect
-# the input as NDJSON records from stdin and output NDJSON records to stdout.
-# The final step of the pipeline should convert the NDJSON records to two
-# separate files: a metadata TSV and a sequences FASTA.
 rule curate:
     input:
         sequences_ndjson="data/ncbi.ndjson",
-        # Change the geolocation_rules input path if you are removing the above two rules
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["curate"]["annotations"],
     output:
@@ -124,8 +116,11 @@ rule subset_metadata:
         subset_metadata="data/subset_metadata.tsv",
     params:
         metadata_fields=",".join(config["curate"]["metadata_columns"]),
+    benchmark:
+        "benchmarks/subset_metadata.txt"
     shell:
         """
         tsv-select -H -f {params.metadata_fields} \
-            {input.metadata} > {output.subset_metadata}
+            {input.metadata} \
+            > {output.subset_metadata}
         """
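The `subset_metadata` rule above is a thin wrapper around `tsv-utils`; when tuning `metadata_columns`, the same selection can be previewed by hand (a sketch — the column names and input path are illustrative):

```bash
# Preview the curated metadata keeping only a few columns of interest.
tsv-select -H -f accession,date,region data/all_metadata.tsv | head
```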
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index ed350ce..2194c6f 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -9,35 +9,8 @@ OUTPUTS:
 
     ndjson = data/ncbi.ndjson
 
-There are two different approaches for fetching data from NCBI.
-Choose the one that works best for the pathogen data and edit the workflow config
-to provide the correct parameter.
-
-1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
-   - requires `ncbi_taxon_id` config
-   - Directly returns NDJSON without custom parsing
-   - Fastest option for large datasets (e.g. SARS-CoV-2)
-   - Only returns metadata fields that are available through NCBI Datasets
-   - Only works for viral genomes
-
-2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
-   - requires `entrez_search_term` config
-   - Returns all available data via a GenBank file
-   - Requires a custom script to parse the necessary fields from the GenBank file
 """
 
-
-# This ruleorder determines which rule to use to produce the final NCBI NDJSON file.
-# The default is set to use NCBI Datasets since it does not require a custom script.
-# Switch the rule order if you plan to use Entrez
-ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson
-
-
-###########################################################################
-####################### 1. Fetch from NCBI Datasets #######################
-###########################################################################
-
-
 rule fetch_ncbi_dataset_package:
     params:
         ncbi_taxon_id=config["ncbi_taxon_id"],
@@ -48,7 +21,6 @@
     output:
         dataset_package=temp("data/ncbi_dataset.zip"),
     benchmark:
         "benchmarks/fetch_ncbi_dataset_package.txt"
     shell:
-        # what's the `:q` mean
         """
         datasets download virus genome taxon {params.ncbi_taxon_id:q} \
             --no-progressbar \
             --filename {output.dataset_package}
         """
 
 
-# Note: This rule is not part of the default workflow!
-# It is intended to be used as a specific target for users to be able
-# to inspect and explore the full raw metadata from NCBI Datasets.
+# Note: This rule is not part of the default workflow! It is intended
+# to be used as a specific target to be able to inspect and explore
+# the full raw metadata from NCBI Datasets.
 rule dump_ncbi_dataset_report:
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
         ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv",
+    benchmark:
+        "benchmarks/dump_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
-            --package {input.dataset_package} > {output.ncbi_dataset_tsv}
+            --package {input.dataset_package} \
+            > {output.ncbi_dataset_tsv}
         """
@@ -76,13 +51,13 @@ rule extract_ncbi_dataset_sequences:
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
         ncbi_dataset_sequences=temp("data/ncbi_dataset_sequences.fasta"),
-    # why benchmarks here but not elsewhere
     benchmark:
         "benchmarks/extract_ncbi_dataset_sequences.txt"
     shell:
         """
         unzip -jp {input.dataset_package} \
-            ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
+            ncbi_dataset/data/genomic.fna \
+            > {output.ncbi_dataset_sequences}
         """
@@ -111,10 +86,6 @@ rule format_ncbi_dataset_report:
         """
 
 
-# Technically you can bypass this step and directly provide FASTA and TSV files
-# as input files for the curate pipeline.
-# We do the formatting here to have a uniform NDJSON file format for the raw
-# data that we host on data.nextstrain.org
 rule format_ncbi_datasets_ndjson:
     input:
         ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
@@ -136,38 +107,3 @@ rule format_ncbi_datasets_ndjson:
             --duplicate-reporting warn \
             2> {log} > {output.ndjson}
         """
-
-
-###########################################################################
-########################## 2. Fetch from Entrez ###########################
-###########################################################################
-
-
-rule fetch_from_ncbi_entrez:
-    params:
-        term=config["entrez_search_term"],
-    output:
-        genbank="data/genbank.gb",
-    # Allow retries in case of network errors
-    retries: 5
-    benchmark:
-        "benchmarks/fetch_from_ncbi_entrez.txt"
-    shell:
-        """
-        vendored/fetch-from-ncbi-entrez \
-            --term {params.term:q} \
-            --output {output.genbank}
-        """
-
-
-rule parse_genbank_to_ndjson:
-    input:
-        genbank="data/genbank.gb",
-    output:
-        ndjson="data/ncbi.ndjson",
-    benchmark:
-        "benchmarks/parse_genbank_to_ndjson.txt"
-    shell:
-        """
-        # Add in custom script to parse needed fields from GenBank file to NDJSON file
-        """
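As its comment says, `dump_ncbi_dataset_report` is an opt-in target; the removed README invoked it by rule name, which still applies here:

```bash
# Write data/ncbi_dataset_report_raw.tsv for manual inspection.
nextstrain build ingest dump_ncbi_dataset_report
```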
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
deleted file mode 100644
index ffbeab8..0000000
--- a/ingest/rules/nextclade.smk
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-This part of the workflow handles running Nextclade on the curated metadata
-and sequences.
-
-REQUIRED INPUTS:
-
-    metadata = data/subset_metadata.tsv
-    sequences = results/sequences.fasta
-
-OUTPUTS:
-
-    metadata = results/metadata.tsv
-    nextclade = results/nextclade.tsv
-    alignment = results/alignment.fasta
-    translations = results/translations.zip
-
-See Nextclade docs for more details on usage, inputs, and outputs if you would
-like to customize the rules:
-https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
-"""
-
-DATASET_NAME = config["nextclade"]["dataset_name"]
-
-
-rule get_nextclade_dataset:
-    """Download Nextclade dataset"""
-    output:
-        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
-    params:
-        dataset_name=DATASET_NAME,
-    shell:
-        # should this get updated to `nextclade3`?
-        """
-        nextclade2 dataset get \
-            --name={params.dataset_name:q} \
-            --output-zip={output.dataset} \
-            --verbose
-        """
-
-
-rule run_nextclade:
-    input:
-        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
-        sequences="results/sequences.fasta",
-    output:
-        nextclade="results/nextclade.tsv",
-        alignment="results/alignment.fasta",
-        translations="results/translations.zip",
-    params:
-        # The lambda is used to deactivate automatic wildcard expansion.
-        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
-        translations=lambda w: "results/translations/{gene}.fasta",
-    shell:
-        """
-        nextclade2 run \
-            {input.sequences} \
-            --input-dataset {input.dataset} \
-            --output-tsv {output.nextclade} \
-            --output-fasta {output.alignment} \
-            --output-translations {params.translations}
-
-        zip -rj {output.translations} results/translations
-        """
-
-
-rule join_metadata_and_nextclade:
-    input:
-        nextclade="results/nextclade.tsv",
-        metadata="data/subset_metadata.tsv",
-        nextclade_field_map=config["nextclade"]["field_map"],
-    output:
-        metadata="results/metadata.tsv",
-    params:
-        metadata_id_field=config["curate"]["output_id_field"],
-        nextclade_id_field=config["nextclade"]["id_field"],
-    shell:
-        """
-        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
-
-        csvtk -tl cut -f $SUBSET_FIELDS \
-            {input.nextclade} \
-        | csvtk -tl rename2 \
-            -F \
-            -f '*' \
-            -p '(.+)' \
-            -r '{{kv}}' \
-            -k {input.nextclade_field_map} \
-        | tsv-join -H \
-            --filter-file - \
-            --key-fields {params.nextclade_id_field} \
-            --data-fields {params.metadata_id_field} \
-            --append-fields '*' \
-            --write-all ? \
-            {input.metadata} \
-        | tsv-select -H --exclude {params.nextclade_id_field} \
-        > {output.metadata}
-        """
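For reference, the `tsv-join` step in the rules deleted above appends Nextclade columns onto the metadata, writing `?` for records Nextclade never saw; a toy sketch with hypothetical file and field names:

```bash
# Join clade calls onto metadata by matching seqName against accession.
tsv-join -H \
    --filter-file results/nextclade.tsv \
    --key-fields seqName \
    --data-fields accession \
    --append-fields clade \
    --write-all ? \
    data/subset_metadata.tsv > results/metadata.tsv
```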