From 3a2acd289683daf7450b6c9d9535df723438bc55 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 23 May 2024 10:18:03 -0700 Subject: [PATCH] Pull ingest and phylogenetic workflows from guide repo [#2] Also apply pre-commit cleanup --- ingest/README.md | 105 +++++++++++ ingest/Snakefile | 86 +++++++++ .../nextstrain-automation/README.md | 38 ++++ .../nextstrain-automation/config.yaml | 24 +++ .../nextstrain-automation/upload.smk | 43 +++++ ingest/defaults/annotations.tsv | 6 + ingest/defaults/config.yaml | 134 ++++++++++++++ ingest/defaults/geolocation_rules.tsv | 7 + ingest/defaults/nextclade_config.yaml | 12 ++ ingest/defaults/nextclade_field_map.tsv | 18 ++ ingest/rules/curate.smk | 131 +++++++++++++ ingest/rules/fetch_from_ncbi.smk | 173 ++++++++++++++++++ ingest/rules/nextclade.smk | 97 ++++++++++ nextstrain-pathogen.yaml | 5 + phylogenetic/README.md | 59 ++++++ phylogenetic/Snakefile | 55 ++++++ phylogenetic/build-configs/ci/config.yaml | 7 + .../build-configs/ci/copy_example_data.smk | 17 ++ phylogenetic/defaults/config.yaml | 5 + phylogenetic/example_data/metadata.tsv | 0 phylogenetic/example_data/sequences.fasta | 0 phylogenetic/rules/annotate_phylogeny.smk | 32 ++++ phylogenetic/rules/construct_phylogeny.smk | 20 ++ phylogenetic/rules/export.smk | 26 +++ phylogenetic/rules/prepare_sequences.smk | 22 +++ 25 files changed, 1122 insertions(+) create mode 100644 ingest/README.md create mode 100644 ingest/Snakefile create mode 100644 ingest/build-configs/nextstrain-automation/README.md create mode 100644 ingest/build-configs/nextstrain-automation/config.yaml create mode 100644 ingest/build-configs/nextstrain-automation/upload.smk create mode 100644 ingest/defaults/annotations.tsv create mode 100644 ingest/defaults/config.yaml create mode 100644 ingest/defaults/geolocation_rules.tsv create mode 100644 ingest/defaults/nextclade_config.yaml create mode 100644 ingest/defaults/nextclade_field_map.tsv create mode 100644 ingest/rules/curate.smk create mode 100644 ingest/rules/fetch_from_ncbi.smk create mode 100644 ingest/rules/nextclade.smk create mode 100644 nextstrain-pathogen.yaml create mode 100644 phylogenetic/README.md create mode 100644 phylogenetic/Snakefile create mode 100644 phylogenetic/build-configs/ci/config.yaml create mode 100644 phylogenetic/build-configs/ci/copy_example_data.smk create mode 100644 phylogenetic/defaults/config.yaml create mode 100644 phylogenetic/example_data/metadata.tsv create mode 100644 phylogenetic/example_data/sequences.fasta create mode 100644 phylogenetic/rules/annotate_phylogeny.smk create mode 100644 phylogenetic/rules/construct_phylogeny.smk create mode 100644 phylogenetic/rules/export.smk create mode 100644 phylogenetic/rules/prepare_sequences.smk diff --git a/ingest/README.md b/ingest/README.md new file mode 100644 index 0000000..c3b3a16 --- /dev/null +++ b/ingest/README.md @@ -0,0 +1,105 @@ +# Ingest + +This workflow ingests public data from NCBI and outputs curated +metadata and sequences that can be used as input for the phylogenetic +workflow. + +## Workflow Usage + +The workflow can be run from the top level pathogen repo directory: + +```bash +nextstrain build ingest +``` + +Alternatively, the workflow can also be run from within the ingest +directory: + +```bash +cd ingest +nextstrain build . 
+``` + +This produces the default outputs of the ingest workflow: + +- metadata = results/metadata.tsv +- sequences = results/sequences.fasta + +### Dumping the full raw metadata from NCBI Datasets + +The workflow has a target for dumping the full raw metadata from NCBI +Datasets. + +```bash +nextstrain build ingest dump_ncbi_dataset_report +``` + +This will produce the file `ingest/data/ncbi_dataset_report_raw.tsv`, +which you can inspect to determine what fields and data to use if you +want to configure the workflow for your pathogen. + +## Defaults + +The defaults directory contains all of the default configurations for +the ingest workflow. + +[defaults/config.yaml](defaults/config.yaml) contains all of the +default configuration parameters used for the ingest workflow. Use +Snakemake's `--configfile`/`--config` options to override these +default values. + +## Snakefile and rules + +The rules directory contains separate Snakefiles (`*.smk`) as modules +of the core ingest workflow. The modules of the workflow are in +separate files to keep the main ingest [Snakefile](Snakefile) succinct +and organized. + +The `workdir` is hardcoded to be the ingest directory so all filepaths +for inputs/outputs should be relative to the ingest directory. + +Modules are all +[included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes) +in the main Snakefile in the order that they are expected to run. + +### Nextclade + +Nextstrain is pushing to standardize ingest workflows with Nextclade +runs to include Nextclade outputs in our publicly hosted data. +However, if a Nextclade dataset does not already exist, it requires +curated data as input, so we are making Nextclade steps optional here. + +If Nextclade config values are included, the Nextclade rules will +create the final metadata TSV by joining the Nextclade output with the +metadata. If Nextclade configs are not included, we rename the subset +metadata TSV to the final metadata TSV. + +To run Nextclade rules, include the `defaults/nextclade_config.yaml` +config file with: + +```bash +nextstrain build ingest --configfile defaults/nextclade_config.yaml +``` + +> [!TIP] +> If the Nextclade dataset is stable and you always want to run the +> Nextclade rules as part of ingest, we recommend moving the Nextclade +> related config parameters from the `defaults/nextclade_config.yaml` +> file to the default config file `defaults/config.yaml`. + +## Build configs + +The build-configs directory contains custom configs and rules that +override and/or extend the default workflow. + +- [nextstrain-automation](build-configs/nextstrain-automation/) - automated internal Nextstrain builds. + +## Vendored + +This repository uses +[`git subrepo`](https://github.com/ingydotnet/git-subrepo) to manage copies +of ingest scripts in [vendored](vendored), from +[nextstrain/ingest](https://github.com/nextstrain/ingest). + +See [vendored/README.md](vendored/README.md#vendoring) for +instructions on how to update the vendored scripts. diff --git a/ingest/Snakefile b/ingest/Snakefile new file mode 100644 index 0000000..98b14a2 --- /dev/null +++ b/ingest/Snakefile @@ -0,0 +1,86 @@ +""" +This is the main ingest Snakefile that orchestrates the full ingest workflow +and defines its default outputs. +""" + + +# The workflow filepaths are written relative to this Snakefile's base +# directory +workdir: workflow.current_basedir + + +# Use default configuration values. Override with Snakemake's +# --configfile/--config options. 
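+# For example (the second override value is a hypothetical taxon ID):
+#
+#     nextstrain build ingest --configfile defaults/nextclade_config.yaml
+#     nextstrain build ingest --config ncbi_taxon_id=11137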
+configfile: "defaults/config.yaml" + + +# This is the default rule that Snakemake will run when there are no +# specified targets. The default output of the ingest workflow is +# usually the curated metadata and sequences. Nextstrain-maintained +# ingest workflows will produce metadata files with the standard +# Nextstrain fields and additional fields that are pathogen specific. +# We recommend using these standard fields in custom ingests as well +# to minimize the customizations you will need for the downstream +# phylogenetic workflow. + + +# TODO: Add link to centralized docs on standard Nextstrain metadata fields +rule all: + input: + "results/sequences.fasta", + "results/metadata.tsv", + + +# Note that only PATHOGEN-level customizations should be added to +# these core steps, meaning they are custom rules necessary for all +# builds of the pathogen. If there are build-specific customizations, +# they should be added with the custom_rules imported below to ensure +# that the core workflow is not complicated by build-specific rules. +include: "rules/fetch_from_ncbi.smk" +include: "rules/curate.smk" + + +# We are pushing to standardize ingest workflows with Nextclade runs +# to include Nextclade outputs in our publicly hosted data. However, +# if a Nextclade dataset does not already exist, creating one requires +# curated data as input, so we are making Nextclade steps optional +# here. +# +# If Nextclade config values are included, the nextclade rules will +# create the final metadata TSV by joining the Nextclade output with +# the metadata. If Nextclade configs are not included, we rename the +# subset metadata TSV to the final metadata TSV. To run nextclade.smk +# rules, include the `defaults/nextclade_config.yaml` config file with +# `nextstrain build ingest --configfile +# defaults/nextclade_config.yaml`. +if "nextclade" in config: + + include: "rules/nextclade.smk" + +else: + + rule create_final_metadata: + input: + metadata="data/subset_metadata.tsv", + output: + metadata="results/metadata.tsv", + shell: + """ + mv {input.metadata} {output.metadata} + """ + + +# Allow users to import custom rules provided via the config. +# This allows users to run custom rules that can extend or override +# the workflow. A concrete example of using custom rules is the +# extension of the workflow with rules to support the Nextstrain +# automation that uploads files and sends internal Slack +# notifications. For extensions, the user will have to specify the +# custom rule targets when running the workflow. For overrides, the +# custom Snakefile will have to use the `ruleorder` directive to allow +# Snakemake to handle ambiguous rules +# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules +if "custom_rules" in config: + for rule_file in config["custom_rules"]: + + include: rule_file diff --git a/ingest/build-configs/nextstrain-automation/README.md b/ingest/build-configs/nextstrain-automation/README.md new file mode 100644 index 0000000..8eff7dd --- /dev/null +++ b/ingest/build-configs/nextstrain-automation/README.md @@ -0,0 +1,38 @@ +# Nextstrain automation + +> [!NOTE] +> External users can ignore this directory! +> This build config/customization is tailored for the internal Nextstrain team +> to extend the core ingest workflow for automated workflows. + +## Update the config + +Update the [config.yaml](config.yaml) for your pathogen: + +1. Edit the `s3_dst` param to add the pathogen repository name. +2. 
Edit the `files_to_upload` param to a mapping of files you need to upload for your pathogen.
+   The default includes suggested files for uploading curated data and Nextclade outputs.

## Run the workflow

+Provide the additional config file to the Snakemake options in order to
+include the custom rules from [upload.smk](upload.smk) in the workflow.
+Specify the `upload_all` target in order to run the additional upload rules.
+
+The upload rules will require AWS credentials for a user that has permissions
+to upload to the Nextstrain data bucket.
+
+The customized workflow can be run from the top level pathogen repo directory with:
+
+```bash
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    ingest \
+        upload_all \
+        --configfile build-configs/nextstrain-automation/config.yaml
+```
+
+## Automated GitHub Action workflows
+
+Additional instructions on how to use this with the shared `pathogen-repo-build`
+GitHub Action workflow to come!
diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
new file mode 100644
index 0000000..411399d
--- /dev/null
+++ b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -0,0 +1,24 @@
+# This configuration file should contain all required configuration parameters
+# for the ingest workflow to run with additional Nextstrain automation rules.
+
+# Custom rules to run as part of the Nextstrain automated workflow.
+# The paths should be relative to the ingest directory.
+custom_rules:
+  - build-configs/nextstrain-automation/upload.smk
+
+# Nextstrain CloudFront domain to ensure that we invalidate CloudFront
+# after the S3 uploads. This is required as long as we are using the
+# AWS CLI for uploads.
+cloudfront_domain: "data.nextstrain.org"
+
+# Nextstrain AWS S3 Bucket with pathogen prefix
+s3_dst: "s3://nextstrain-data/files/workflows/seasonal-cov"
+
+# Mapping of files to upload
+## TODO verify
+files_to_upload:
+  ncbi.ndjson.zst: data/ncbi.ndjson
+  metadata.tsv.zst: results/metadata.tsv
+  sequences.fasta.zst: results/sequences.fasta
+  alignments.fasta.zst: results/alignment.fasta
+  translations.zip: results/translations.zip
diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk
new file mode 100644
index 0000000..6954966
--- /dev/null
+++ b/ingest/build-configs/nextstrain-automation/upload.smk
@@ -0,0 +1,43 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+
+Files to upload must be defined in the `files_to_upload` config param, where
+the keys are the remote files and the values are the local filepaths
+relative to the ingest directory.
+
+Produces a single file for each uploaded file:
+    "results/upload/{remote_file}.upload"
+
+The rule `upload_all` can be used as a target to upload all files.
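+
+For example, with the first entry of the default automation config
+(`metadata.tsv.zst: results/metadata.tsv`), the rule `upload_to_s3` uploads
+the local file results/metadata.tsv to {s3_dst}/metadata.tsv.zst and records
+the upload log in "results/upload/metadata.tsv.zst.upload".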
+""" + +import os + +slack_envvars_defined = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ +send_notifications = config.get("send_slack_notifications", False) and slack_envvars_defined + + +rule upload_to_s3: + input: + file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file], + output: + "results/upload/{remote_file}.upload", + params: + quiet="" if send_notifications else "--quiet", + s3_dst=config["s3_dst"], + cloudfront_domain=config["cloudfront_domain"], + shell: + """ + ./vendored/upload-to-s3 \ + {params.quiet} \ + {input.file_to_upload:q} \ + {params.s3_dst:q}/{wildcards.remote_file:q} \ + {params.cloudfront_domain} 2>&1 | tee {output} + """ + + +rule upload_all: + input: + uploads=[f"results/upload/{remote_file}.upload" for remote_file in config["files_to_upload"].keys()], + output: + touch("results/upload_all.done"), diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv new file mode 100644 index 0000000..89c0059 --- /dev/null +++ b/ingest/defaults/annotations.tsv @@ -0,0 +1,6 @@ +# Manually curated annotations TSV file +# The TSV should not have a header and should have exactly three columns: +# id to match existing metadata, field name, and field value +# If there are multiple annotations for the same id and field, then the last value is used +# Lines starting with '#' are treated as comments +# Any '#' after the field value are treated as comments. diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml new file mode 100644 index 0000000..e9af859 --- /dev/null +++ b/ingest/defaults/config.yaml @@ -0,0 +1,134 @@ +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run to completion. +# +# Define optional config parameters with their default values here so that users +# do not have to dig through the workflows to figure out the default values + +# Required to fetch from Entrez +entrez_search_term: "" + +# Required to fetch from NCBI Datasets +ncbi_taxon_id: "" + +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country + +# Config parameters related to the curate pipeline +curate: + # URL pointed to public generalized geolocation rules + # For the Nextstrain team, this is currently + # "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" + # The path to the local geolocation rules within the pathogen repo + # The path should be relative to the ingest directory. + local_geolocation_rules: "defaults/geolocation_rules.tsv" + # List of field names to change where the key is the original field name and the value is the new field name + # The original field names should match the ncbi_datasets_fields provided above. 
+  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
+  field_map:
+    accession: accession
+    accession_version: accession_version
+    sourcedb: database
+    sra-accs: sra_accessions
+    isolate-lineage: strain
+    geo-region: region
+    geo-location: location
+    isolate-collection-date: date
+    release-date: date_released
+    update-date: date_updated
+    length: length
+    host-name: host
+    isolate-lineage-source: sample_type
+    biosample-acc: biosample_accessions
+    submitter-names: authors
+    submitter-affiliation: institution
+    submitter-country: submitter_country
+  # Standardized strain name regex
+  # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
+  strain_regex: "^.+$"
+  # Back up strain name field to use if "strain" doesn't match regex above
+  strain_backup_fields: ["accession"]
+  # List of date fields to standardize to ISO format YYYY-MM-DD
+  date_fields: ["date", "date_released", "date_updated"]
+  # List of expected date formats that are present in the date fields provided above
+  # These date formats should use directives expected by datetime
+  # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
+  expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
+  titlecase:
+    # List of string fields to titlecase
+    fields: ["region", "country", "division", "location"]
+    # List of abbreviations not cast to titlecase, keeps uppercase
+    abbreviations: ["USA"]
+    # Articles that should not be cast to titlecase
+    articles:
+      - and
+      - d
+      - de
+      - del
+      - des
+      - di
+      - do
+      - en
+      - l
+      - la
+      - las
+      - le
+      - los
+      - nad
+      - of
+      - op
+      - sur
+      - the
+      - y
+  # Metadata field that contains the list of authors associated with the sequence
+  authors_field: "authors"
+  # Default value to use if the authors field is empty
+  authors_default_value: "?"
+  # Name to use for the generated abbreviated authors field
+  abbr_authors_field: "abbr_authors"
+  # Path to the manual annotations file
+  # The path should be relative to the ingest directory
+  annotations: "defaults/annotations.tsv"
+  # The ID field in the metadata to use to merge the manual annotations
+  annotations_id: "accession"
+  # The ID field in the metadata to use as the sequence id in the output FASTA file
+  output_id_field: "accession"
+  # The field in the NDJSON record that contains the actual genomic sequence
+  output_sequence_field: "sequence"
+  # The list of metadata columns to keep in the final output of the curation pipeline.
+  metadata_columns:
+    - accession
+    - accession_version
+    - strain
+    - date
+    - region
+    - country
+    - division
+    - location
+    - length
+    - host
+    - date_released
+    - date_updated
+    - sra_accessions
+    - authors
+    - abbr_authors
+    - institution
diff --git a/ingest/defaults/geolocation_rules.tsv b/ingest/defaults/geolocation_rules.tsv
new file mode 100644
index 0000000..bb5f7d2
--- /dev/null
+++ b/ingest/defaults/geolocation_rules.tsv
@@ -0,0 +1,7 @@
+# TSV file of geolocation rules with the format:
+# '<raw_geolocation><tab><annotated_geolocation>' where the raw and annotated geolocations
+# are formatted as '<region>/<country>/<division>/<location>'
+# If creating a general rule, then the raw field value can be substituted with '*'
+# Lines starting with '#' will be ignored as comments.
+# Trailing '#' will be ignored as comments.
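+# For example, a general rule that normalizes the casing of any US record
+# (hypothetical rule, shown here commented out as an illustration):
+# North America/usa/*/*	North America/USA/*/*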
diff --git a/ingest/defaults/nextclade_config.yaml b/ingest/defaults/nextclade_config.yaml
new file mode 100644
index 0000000..3c48bc8
--- /dev/null
+++ b/ingest/defaults/nextclade_config.yaml
@@ -0,0 +1,12 @@
+# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
+# Note that this requires a Nextclade dataset to already exist for your pathogen.
nextclade:
+  # The name of the Nextclade dataset to use for running nextclade.
+  # Run `nextclade dataset list` to get a full list of available Nextclade datasets
+  dataset_name: ""
+  # Path to the mapping for renaming Nextclade output columns
+  # The path should be relative to the ingest directory
+  field_map: "defaults/nextclade_field_map.tsv"
+  # This is the ID field you would use to match the Nextclade output with the record metadata.
+  # This should be the new name that you have defined in your field map.
+  id_field: "seqName"
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
new file mode 100644
index 0000000..513b0fd
--- /dev/null
+++ b/ingest/defaults/nextclade_field_map.tsv
@@ -0,0 +1,18 @@
+# TSV file that is a mapping of column names for Nextclade output TSV
+# The first column should be the original column name of the Nextclade TSV
+# The second column should be the new column name to use in the final metadata TSV
+# Nextclade can have pathogen specific output columns so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+lineage	lineage
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
new file mode 100644
index 0000000..112eb34
--- /dev/null
+++ b/ingest/rules/curate.smk
@@ -0,0 +1,131 @@
+"""
+This part of the workflow handles the curation of data from NCBI.
+
+REQUIRED INPUTS:
+
+    ndjson = data/ncbi.ndjson
+
+OUTPUTS:
+
+    metadata = data/subset_metadata.tsv
+    sequences = results/sequences.fasta
+
+"""
+
+
+# The following two rules can be ignored if you choose not to use the
+# generalized geolocation rules that are shared across pathogens.
+# The Nextstrain team will try to maintain a generalized set of geolocation
+# rules that can then be overridden by local geolocation rules per pathogen repo.
+rule fetch_general_geolocation_rules:
+    output:
+        general_geolocation_rules="data/general-geolocation-rules.tsv",
+    params:
+        geolocation_rules_url=config["curate"]["geolocation_rules_url"],
+    shell:
+        """
+        curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
+        """
+
+
+rule concat_geolocation_rules:
+    input:
+        general_geolocation_rules="data/general-geolocation-rules.tsv",
+        local_geolocation_rules=config["curate"]["local_geolocation_rules"],
+    output:
+        all_geolocation_rules="data/all-geolocation-rules.tsv",
+    shell:
+        # Note: `>>` behaves the same as `>` here because Snakemake deletes any
+        # existing output file before the job runs.
+        """
+        cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules}
+        """
+
+
+def format_field_map(field_map: dict[str, str]) -> str:
+    """
+    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.
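+
+    For example (using a rename from the default field map above):
+
+        >>> format_field_map({"geo-region": "region"})
+        '"geo-region"="region"'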
+ """ + return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()]) + + +# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data. +# You may want to add and/or remove steps from the pipeline for custom metadata +# curation for your pathogen. Note that the curate pipeline is streaming NDJSON +# records between scripts, so any custom scripts added to the pipeline should expect +# the input as NDJSON records from stdin and output NDJSON records to stdout. +# The final step of the pipeline should convert the NDJSON records to two +# separate files: a metadata TSV and a sequences FASTA. +rule curate: + input: + sequences_ndjson="data/ncbi.ndjson", + # Change the geolocation_rules input path if you are removing the above two rules + all_geolocation_rules="data/all-geolocation-rules.tsv", + annotations=config["curate"]["annotations"], + output: + metadata="data/all_metadata.tsv", + sequences="results/sequences.fasta", + log: + "logs/curate.txt", + benchmark: + "benchmarks/curate.txt" + params: + field_map=format_field_map(config["curate"]["field_map"]), + strain_regex=config["curate"]["strain_regex"], + strain_backup_fields=config["curate"]["strain_backup_fields"], + date_fields=config["curate"]["date_fields"], + expected_date_formats=config["curate"]["expected_date_formats"], + articles=config["curate"]["titlecase"]["articles"], + abbreviations=config["curate"]["titlecase"]["abbreviations"], + titlecase_fields=config["curate"]["titlecase"]["fields"], + authors_field=config["curate"]["authors_field"], + authors_default_value=config["curate"]["authors_default_value"], + abbr_authors_field=config["curate"]["abbr_authors_field"], + annotations_id=config["curate"]["annotations_id"], + id_field=config["curate"]["output_id_field"], + sequence_field=config["curate"]["output_sequence_field"], + shell: + """ + (cat {input.sequences_ndjson} \ + | ./vendored/transform-field-names \ + --field-map {params.field_map} \ + | augur curate normalize-strings \ + | ./vendored/transform-strain-names \ + --strain-regex {params.strain_regex} \ + --backup-fields {params.strain_backup_fields} \ + | augur curate format-dates \ + --date-fields {params.date_fields} \ + --expected-date-formats {params.expected_date_formats} \ + | ./vendored/transform-genbank-location \ + | augur curate titlecase \ + --titlecase-fields {params.titlecase_fields} \ + --articles {params.articles} \ + --abbreviations {params.abbreviations} \ + | ./vendored/transform-authors \ + --authors-field {params.authors_field} \ + --default-value {params.authors_default_value} \ + --abbr-authors-field {params.abbr_authors_field} \ + | ./vendored/apply-geolocation-rules \ + --geolocation-rules {input.all_geolocation_rules} \ + | ./vendored/merge-user-metadata \ + --annotations {input.annotations} \ + --id-field {params.annotations_id} \ + | augur curate passthru \ + --output-metadata {output.metadata} \ + --output-fasta {output.sequences} \ + --output-id-field {params.id_field} \ + --output-seq-field {params.sequence_field} ) 2>> {log} + """ + + +rule subset_metadata: + input: + metadata="data/all_metadata.tsv", + output: + subset_metadata="data/subset_metadata.tsv", + params: + metadata_fields=",".join(config["curate"]["metadata_columns"]), + shell: + """ + tsv-select -H -f {params.metadata_fields} \ + {input.metadata} > {output.subset_metadata} + """ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk new file mode 100644 index 0000000..ed350ce --- /dev/null +++ 
b/ingest/rules/fetch_from_ncbi.smk
@@ -0,0 +1,173 @@
+"""
+This part of the workflow handles fetching sequences and metadata from NCBI.
+
+REQUIRED INPUTS:
+
+    None
+
+OUTPUTS:
+
+    ndjson = data/ncbi.ndjson
+
+There are two different approaches for fetching data from NCBI.
+Choose the one that works best for the pathogen data and edit the workflow config
+to provide the correct parameter.
+
+1. Fetch with NCBI Datasets (https://www.ncbi.nlm.nih.gov/datasets/)
+    - requires `ncbi_taxon_id` config
+    - Directly returns NDJSON without custom parsing
+    - Fastest option for large datasets (e.g. SARS-CoV-2)
+    - Only returns metadata fields that are available through NCBI Datasets
+    - Only works for viral genomes
+
+2. Fetch from Entrez (https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+    - requires `entrez_search_term` config
+    - Returns all available data via a GenBank file
+    - Requires a custom script to parse the necessary fields from the GenBank file
+"""
+
+
+# This ruleorder determines which rule to use to produce the final NCBI NDJSON file.
+# The default is set to use NCBI Datasets since it does not require a custom script.
+# Switch the rule order if you plan to use Entrez
+ruleorder: format_ncbi_datasets_ndjson > parse_genbank_to_ndjson
+
+
+###########################################################################
+####################### 1. Fetch from NCBI Datasets #######################
+###########################################################################
+
+
+rule fetch_ncbi_dataset_package:
+    params:
+        ncbi_taxon_id=config["ncbi_taxon_id"],
+    output:
+        dataset_package=temp("data/ncbi_dataset.zip"),
+    # Allow retries in case of network errors
+    retries: 5
+    benchmark:
+        "benchmarks/fetch_ncbi_dataset_package.txt"
+    shell:
+        # `:q` is Snakemake's quoting modifier: it shell-quotes the substituted
+        # value so spaces or special characters are passed through safely.
+        """
+        datasets download virus genome taxon {params.ncbi_taxon_id:q} \
+            --no-progressbar \
+            --filename {output.dataset_package}
+        """
+
+
+# Note: This rule is not part of the default workflow!
+# It is intended to be used as a specific target for users to be able
+# to inspect and explore the full raw metadata from NCBI Datasets.
rule dump_ncbi_dataset_report:
+    input:
+        dataset_package="data/ncbi_dataset.zip",
+    output:
+        ncbi_dataset_tsv="data/ncbi_dataset_report_raw.tsv",
+    shell:
+        """
+        dataformat tsv virus-genome \
+            --package {input.dataset_package} > {output.ncbi_dataset_tsv}
+        """
+
+
+rule extract_ncbi_dataset_sequences:
+    input:
+        dataset_package="data/ncbi_dataset.zip",
+    output:
+        ncbi_dataset_sequences=temp("data/ncbi_dataset_sequences.fasta"),
+    # The `benchmark` directive records this rule's runtime and memory usage,
+    # which is useful for potentially slow steps like archive extraction.
+    benchmark:
+        "benchmarks/extract_ncbi_dataset_sequences.txt"
+    shell:
+        """
+        unzip -jp {input.dataset_package} \
+            ncbi_dataset/data/genomic.fna > {output.ncbi_dataset_sequences}
+        """
+
+
+rule format_ncbi_dataset_report:
+    input:
+        dataset_package="data/ncbi_dataset.zip",
+    output:
+        ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
+    params:
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
+    benchmark:
+        "benchmarks/format_ncbi_dataset_report.txt"
+    shell:
+        """
+        dataformat tsv virus-genome \
+            --package {input.dataset_package} \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk fix-quotes -Ht \
+            | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
+            | csvtk rename -t -f accession -n accession_version \
+            | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
+            | csvtk del-quotes -t \
+            | tsv-select -H -f accession --rest last \
+            > {output.ncbi_dataset_tsv}
+        """
+
+
+# Technically you can bypass this step and directly provide FASTA and TSV files
+# as input files for the curate pipeline.
+# We do the formatting here to have a uniform NDJSON file format for the raw
+# data that we host on data.nextstrain.org
+rule format_ncbi_datasets_ndjson:
+    input:
+        ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
+        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
+    output:
+        ndjson="data/ncbi.ndjson",
+    log:
+        "logs/format_ncbi_datasets_ndjson.txt",
+    benchmark:
+        "benchmarks/format_ncbi_datasets_ndjson.txt"
+    shell:
+        """
+        augur curate passthru \
+            --metadata {input.ncbi_dataset_tsv} \
+            --fasta {input.ncbi_dataset_sequences} \
+            --seq-id-column accession_version \
+            --seq-field sequence \
+            --unmatched-reporting warn \
+            --duplicate-reporting warn \
+            2> {log} > {output.ndjson}
+        """
+
+
+###########################################################################
+########################## 2. Fetch from Entrez ###########################
+###########################################################################
+
+
+rule fetch_from_ncbi_entrez:
+    params:
+        term=config["entrez_search_term"],
+    output:
+        genbank="data/genbank.gb",
+    # Allow retries in case of network errors
+    retries: 5
+    benchmark:
+        "benchmarks/fetch_from_ncbi_entrez.txt"
+    shell:
+        """
+        vendored/fetch-from-ncbi-entrez \
+            --term {params.term:q} \
+            --output {output.genbank}
+        """
+
+
+rule parse_genbank_to_ndjson:
+    input:
+        genbank="data/genbank.gb",
+    output:
+        ndjson="data/ncbi.ndjson",
+    benchmark:
+        "benchmarks/parse_genbank_to_ndjson.txt"
+    shell:
+        """
+        # Add in custom script to parse needed fields from GenBank file to NDJSON file
+        """
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
new file mode 100644
index 0000000..ffbeab8
--- /dev/null
+++ b/ingest/rules/nextclade.smk
@@ -0,0 +1,97 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+
+REQUIRED INPUTS:
+
+    metadata = data/subset_metadata.tsv
+    sequences = results/sequences.fasta
+
+OUTPUTS:
+
+    metadata = results/metadata.tsv
+    nextclade = results/nextclade.tsv
+    alignment = results/alignment.fasta
+    translations = results/translations.zip
+
+See Nextclade docs for more details on usage, inputs, and outputs if you would
+like to customize the rules:
+https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
+"""
+
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """Download Nextclade dataset"""
+    output:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+    params:
+        dataset_name=DATASET_NAME,
+    shell:
+        # TODO: update to Nextclade v3 (plain `nextclade`) when migrating; note
+        # that v3 renames the `--output-translations` placeholder used in
+        # `run_nextclade` below from {gene} to {cds}.
+        """
+        nextclade2 dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    input:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+        sequences="results/sequences.fasta",
+    output:
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",
+    params:
+        # The lambda is used to deactivate automatic wildcard expansion.
+        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
+        translations=lambda w: "results/translations/{gene}.fasta",
+    shell:
+        """
+        nextclade2 run \
+            {input.sequences} \
+            --input-dataset {input.dataset} \
+            --output-tsv {output.nextclade} \
+            --output-fasta {output.alignment} \
+            --output-translations {params.translations}
+
+        zip -rj {output.translations} results/translations
+        """
+
+
+rule join_metadata_and_nextclade:
+    input:
+        nextclade="results/nextclade.tsv",
+        metadata="data/subset_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        metadata_id_field=config["curate"]["output_id_field"],
+        nextclade_id_field=config["nextclade"]["id_field"],
+    shell:
+        """
+        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk -tl cut -f $SUBSET_FIELDS \
+            {input.nextclade} \
+        | csvtk -tl rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all ? \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+        > {output.metadata}
+        """
diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml
new file mode 100644
index 0000000..b74c50d
--- /dev/null
+++ b/nextstrain-pathogen.yaml
@@ -0,0 +1,5 @@
+# This is currently an empty file to indicate the top level pathogen repo.
+# The inclusion of this file allows the Nextstrain CLI to run
+# `nextstrain build` from any directory, regardless of runtime.
+#
+# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details.
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
new file mode 100644
index 0000000..7e961c8
--- /dev/null
+++ b/phylogenetic/README.md
@@ -0,0 +1,59 @@
+# Phylogenetic
+
+This workflow uses metadata and sequences to produce one or multiple [Nextstrain datasets][]
+that can be visualized in Auspice.
+
+## Workflow Usage
+
+The workflow can be run from the top level pathogen repo directory:
+
+```bash
+nextstrain build phylogenetic
+```
+
+Alternatively, the workflow can also be run from within the phylogenetic directory:
+
+```bash
+cd phylogenetic
+nextstrain build .
+```
+
+This produces the default outputs of the phylogenetic workflow:
+
+- auspice_json(s) = auspice/*.json
+
+## Data Requirements
+
+The core phylogenetic workflow will use metadata values as-is, so please do any
+desired data formatting and curation as part of the [ingest](../ingest/) workflow.
+
+1. The metadata must include an ID column that can be used as an exact match for
+   the sequence ID present in the FASTA headers.
+2. The `date` column in the metadata must be in ISO 8601 date format (i.e. YYYY-MM-DD).
+3. Ambiguous dates should be masked with `XX` (e.g. 2023-01-XX).
+
+## Defaults
+
+The defaults directory contains all of the default configurations for the phylogenetic workflow.
+
+[defaults/config.yaml](defaults/config.yaml) contains all of the default configuration parameters
+used for the phylogenetic workflow. Use Snakemake's `--configfile`/`--config`
+options to override these default values.
+
+## Snakefile and rules
+
+The rules directory contains separate Snakefiles (`*.smk`) as modules of the core phylogenetic workflow.
+The modules of the workflow are in separate files to keep the main phylogenetic [Snakefile](Snakefile) succinct and organized.
+
+The `workdir` is hardcoded to be the phylogenetic directory so all filepaths for
+inputs/outputs should be relative to the phylogenetic directory.
+
+Modules are all [included](https://snakemake.readthedocs.io/en/stable/snakefiles/modularization.html#includes)
+in the main Snakefile in the order that they are expected to run.
+
+## Build configs
+
+The build-configs directory contains custom configs and rules that override and/or
+extend the default workflow.
+
+- [ci](build-configs/ci/) - CI build that runs with example data
+
+[Nextstrain datasets]: https://docs.nextstrain.org/en/latest/reference/glossary.html#term-dataset
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
new file mode 100644
index 0000000..53eb471
--- /dev/null
+++ b/phylogenetic/Snakefile
@@ -0,0 +1,55 @@
+"""
+This is the main phylogenetic Snakefile that orchestrates the full phylogenetic
+workflow and defines its default output(s).
+"""
+
+
+# The workflow filepaths are written relative to this Snakefile's base directory
+workdir: workflow.current_basedir
+
+
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "defaults/config.yaml"
+
+
+# This is the default rule that Snakemake will run when there are no specified targets.
+# The default output of the phylogenetic workflow is usually the final
+# Nextstrain dataset(s) or Auspice JSON(s) that are output from `rules/export.smk`
+# See Nextstrain docs on expected naming conventions of dataset files
+# https://docs.nextstrain.org/page/reference/data-formats.html
+rule all:
+    input:
+        # Fill in path(s) to the final exported Auspice JSON(s)
+        auspice_json="",
+
+
+# These rules are imported in the order that they are expected to run.
+# Each Snakefile will have documented inputs and outputs that should be kept as
+# consistent interfaces across pathogen repos. This allows us to define typical
+# steps that are required for a phylogenetic workflow, but still allow pathogen
+# specific customizations within each step.
+# Note that only PATHOGEN level customizations should be added to these
+# core steps, meaning they are custom rules necessary for all builds of the pathogen.
+# If there are build specific customizations, they should be added with the
+# custom_rules imported below to ensure that the core workflow is not complicated
+# by build specific rules.
+include: "rules/prepare_sequences.smk"
+include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
+include: "rules/export.smk"
+
+
+# Allow users to import custom rules provided via the config.
+# This allows users to run custom rules that can extend or override the workflow.
+# A concrete example of using custom rules is the extension of the workflow with
+# rules to support the Nextstrain automation that uploads files and sends internal
+# Slack notifications.
+# For extensions, the user will have to specify the custom rule targets when
+# running the workflow.
+# For overrides, the custom Snakefile will have to use the `ruleorder` directive
+# to allow Snakemake to handle ambiguous rules
+# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml
new file mode 100644
index 0000000..de89c67
--- /dev/null
+++ b/phylogenetic/build-configs/ci/config.yaml
@@ -0,0 +1,7 @@
+# This configuration file contains the custom configuration parameters
+# for the CI workflow to run with the example data.
+
+# Custom rules to run as part of the CI automated workflow
+# The paths should be relative to the phylogenetic directory.
+custom_rules:
+  - build-configs/ci/copy_example_data.smk
diff --git a/phylogenetic/build-configs/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk
new file mode 100644
index 0000000..8634488
--- /dev/null
+++ b/phylogenetic/build-configs/ci/copy_example_data.smk
@@ -0,0 +1,17 @@
+rule copy_example_data:
+    input:
+        sequences="example_data/sequences.fasta",
+        metadata="example_data/metadata.tsv",
+    output:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    shell:
+        """
+        cp -f {input.sequences} {output.sequences}
+        cp -f {input.metadata} {output.metadata}
+        """
+
+
+# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules
+# that have the same output as the copy_example_data rule.
+# ruleorder: copy_example_data > ...
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
new file mode 100644
index 0000000..234fc62
--- /dev/null
+++ b/phylogenetic/defaults/config.yaml
@@ -0,0 +1,5 @@
+# This configuration file should contain all required configuration parameters
+# for the phylogenetic workflow to run to completion.
+#
+# Define optional config parameters with their default values here so that users
+# do not have to dig through the workflows to figure out the default values
diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta
new file mode 100644
index 0000000..e69de29
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
new file mode 100644
index 0000000..9b9843e
--- /dev/null
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -0,0 +1,32 @@
+"""
+This part of the workflow creates additional annotations for the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    metadata = data/metadata.tsv
+    prepared_sequences = results/prepared_sequences.fasta
+    tree = results/tree.nwk
+
+OUTPUTS:
+
+    node_data = results/*.json
+
+    There are no required outputs for this part of the workflow as it depends
+    on which annotations are created. All outputs are expected to be node data
+    JSON files that can be fed into `augur export`.
+
+    See Nextstrain's data format docs for more details on node data JSONs:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur traits
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
+
+Custom node data files can also be produced by build-specific scripts in addition
+to the ones produced by Augur commands.
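+
+As a sketch, a node data JSON for ancestral traits could come from a command
+like the following (hypothetical columns; adjust for your pathogen):
+
+    augur traits \
+        --tree results/tree.nwk \
+        --metadata data/metadata.tsv \
+        --columns region country \
+        --output-node-data results/traits.json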
+""" diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk new file mode 100644 index 0000000..7652005 --- /dev/null +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -0,0 +1,20 @@ +""" +This part of the workflow constructs the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + +OUTPUTS: + + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + +This part of the workflow usually includes the following steps: + + - augur tree + - augur refine + +See Augur's usage docs for these commands for more details. +""" diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk new file mode 100644 index 0000000..a273659 --- /dev/null +++ b/phylogenetic/rules/export.smk @@ -0,0 +1,26 @@ +""" +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + node_data = results/*.json + +OUTPUTS: + + auspice_json = auspice/${build_name}.json + + There are optional sidecar JSON files that can be exported as part of the dataset. + See Nextstrain's data format docs for more details on sidecar files: + https://docs.nextstrain.org/page/reference/data-formats.html + +This part of the workflow usually includes the following steps: + + - augur export v2 + - augur frequencies + +See Augur's usage docs for these commands for more details. +""" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk new file mode 100644 index 0000000..c1c9e22 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences.smk @@ -0,0 +1,22 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + sequences = data/sequences.fasta + reference = ../shared/reference.fasta + +OUTPUTS: + + prepared_sequences = results/prepared_sequences.fasta + +This part of the workflow usually includes the following steps: + + - augur index + - augur filter + - augur align + - augur mask + +See Augur's usage docs for these commands for more details. +"""
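+
+
+# A minimal sketch of a typical preparation step (hypothetical filter
+# parameters; uncomment and adapt for your pathogen):
+#
+# rule filter:
+#     input:
+#         sequences="data/sequences.fasta",
+#         metadata="data/metadata.tsv",
+#     output:
+#         sequences="results/filtered.fasta",
+#     shell:
+#         """
+#         augur filter \
+#             --sequences {input.sequences} \
+#             --metadata {input.metadata} \
+#             --min-date 2000 \
+#             --output-sequences {output.sequences}
+#         """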