From 1140955fae354b87b3c92e2d5a8191898a6f1312 Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Mon, 27 May 2024 16:37:51 +1200
Subject: [PATCH 1/3] Add Clade-I build

Clock rate obtained by using TimeTree on a clade-I dataset excluding
clade-Ib samples. (The inferred rate when including clade-Ib samples was
1.46E-6, so around 10% faster.) As more data become publicly available
we should update this.

Clade Ib label added after correspondence with INRB, DRC
---
 .../defaults/clade-i/auspice_config.json  | 70 +++++++++++++++++++
 phylogenetic/defaults/clade-i/config.yaml | 58 +++++++++++++++
 phylogenetic/defaults/clade-i/include.txt |  0
 phylogenetic/defaults/clades.tsv          |  3 +
 phylogenetic/rules/prepare_sequences.smk  |  6 ++
 5 files changed, 137 insertions(+)
 create mode 100644 phylogenetic/defaults/clade-i/auspice_config.json
 create mode 100644 phylogenetic/defaults/clade-i/config.yaml
 create mode 100644 phylogenetic/defaults/clade-i/include.txt

diff --git a/phylogenetic/defaults/clade-i/auspice_config.json b/phylogenetic/defaults/clade-i/auspice_config.json
new file mode 100644
index 00000000..a924efca
--- /dev/null
+++ b/phylogenetic/defaults/clade-i/auspice_config.json
@@ -0,0 +1,70 @@
+{
+  "title": "Genomic epidemiology of mpox clade I viruses",
+  "maintainers": [
+    {"name": "Nextstrain team", "url": "https://nextstrain.org"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/mpox",
+  "colorings": [
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    },
+    {
+      "key": "GA_CT_fraction",
+      "title": "G→A or C→T fraction",
+      "type": "continuous"
+    },
+    {
+      "key": "dinuc_context_fraction",
+      "title": "NGA/TCN context of G→A/C→T mutations",
+      "type": "continuous"
+    },
+    {
+      "key": "recency",
+      "title": "Submission Recency",
+      "type": "categorical"
+    },
+    {
+      "key": "date_submitted",
+      "title": "Release Date",
+      "type": "categorical"
+    },
+    {
+      "key": "date",
+      "title": "Collection date",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country"
+  ],
+  "display_defaults": {
+    "color_by": "country",
+    "map_triplicate": true,
+    "distance_measure": "num_date",
+    "transmission_lines": false
+  },
+  "filters": [
+    "country",
+    "region",
+    "recency",
+    "host"
+  ]
+}
diff --git a/phylogenetic/defaults/clade-i/config.yaml b/phylogenetic/defaults/clade-i/config.yaml
new file mode 100644
index 00000000..e8ad8504
--- /dev/null
+++ b/phylogenetic/defaults/clade-i/config.yaml
@@ -0,0 +1,58 @@
+reference: "defaults/reference.fasta"
+genome_annotation: "defaults/genome_annotation.gff3"
+genbank_reference: "defaults/reference.gb"
+include: "defaults/clade-i/include.txt"
+clades: "defaults/clades.tsv"
+lat_longs: "defaults/lat_longs.tsv"
+auspice_config: "defaults/clade-i/auspice_config.json"
+description: "defaults/description.md"
+tree_mask: "defaults/tree_mask.tsv"
+
+# Use `accession` as the ID column since `strain` currently contains duplicates¹.
+# ¹ https://github.com/nextstrain/mpox/issues/33
+strain_id_field: "accession"
+display_strain_field: "strain"
+
+build_name: "clade-i"
+auspice_name: "mpox_clade-I"
+
+filter:
+  min_date: 1900
+  min_length: 100000
+  exclude_where: 'clade!=I'
+
+
+### We don't want to subsample, so specify a config which is essentially a no-op
+subsample:
+  everything:
+    group_by: ""
+    sequences_per_group: ""
+
+## align
+max_indel: 10000
+seed_spacing: 1000
+
+## treefix
+fix_tree: true
+treefix_root: "" # without a root we'll midpoint root which should work great for clade I
+
+## refine
+timetree: true
+root: "best"
+# Clock rate chosen via treetime inference on Clade-I data excluding Clade-Ib seqs (n=73)
+# TODO: update this once more public data is available. NOTE(review): the commit message quotes 1.46E-6 as the rate *including* Clade-Ib seqs — confirm the clock_rate value.
+clock_rate: 1.465e-06
+clock_std_dev: 6.7e-07
+divergence_units: "mutations"
+
+traits:
+  columns: "country"
+  sampling_bias_correction: 3
+
+## recency
+recency: true
+
+mask:
+  from_beginning: 800
+  from_end: 6422
+  maskfile: "defaults/mask.bed"
diff --git a/phylogenetic/defaults/clade-i/include.txt b/phylogenetic/defaults/clade-i/include.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/phylogenetic/defaults/clades.tsv b/phylogenetic/defaults/clades.tsv
index d35cfbb8..61ffa503 100644
--- a/phylogenetic/defaults/clades.tsv
+++ b/phylogenetic/defaults/clades.tsv
@@ -5,6 +5,9 @@ outgroup	nuc	179226	T
 
 clade I	nuc	87560	T
 clade I	nuc	136015	A
+clade Ib	nuc	6014	G
+clade Ib	nuc	108966	T
+
 clade II	nuc	86502	G
 clade II	nuc	150970	A
 clade II	nuc	35352	C
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index c9ab5012..4916379a 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -65,6 +65,11 @@ rule filter:
         min_date=config["filter"]["min_date"],
         min_length=config["filter"]["min_length"],
         strain_id=config["strain_id_field"],
+        exclude_where=lambda w: (
+            f"--exclude-where {config['filter']['exclude_where']}"
+            if "exclude_where" in config["filter"]
+            else ""
+        ),
     shell:
         """
         augur filter \
@@ -74,6 +79,7 @@ rule filter:
             --output-sequences {output.sequences} \
             --output-metadata {output.metadata} \
             --exclude {input.exclude} \
+            {params.exclude_where} \
             --min-date {params.min_date} \
             --min-length {params.min_length} \
             --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \

From ec5aa8bb2fe7b56ade11a6f9d566a7955f8c587b Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Tue, 18 Jun 2024 11:23:31 +1200
Subject: [PATCH 2/3] Add Clade-I GitHub action

Following the hmpxv1 action, but using the docker runtime as the size of
the dataset allows us to rerun it on the action runner directly.
---
 .github/workflows/rebuild-clade-i.yaml | 59 ++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 .github/workflows/rebuild-clade-i.yaml

diff --git a/.github/workflows/rebuild-clade-i.yaml b/.github/workflows/rebuild-clade-i.yaml
new file mode 100644
index 00000000..5d17e8b5
--- /dev/null
+++ b/.github/workflows/rebuild-clade-i.yaml
@@ -0,0 +1,59 @@
+name: Rebuild clade-I
+
+on:
+  repository_dispatch:
+    types:
+      - rebuild
+      - rebuild_clade-i
+
+  workflow_dispatch:
+    inputs:
+      trial_name:
+        description: "If set, result will be at nextstrain.org/staging/trial/${trial_name}/${auspice_name}"
+        required: false
+
+jobs:
+  set_config_overrides:
+    runs-on: ubuntu-latest
+    steps:
+      - id: config
+        name: Set config overrides
+        env:
+          TRIAL_NAME: ${{ inputs.trial_name }}
+        run: |
+          config=""
+          if [[ "$TRIAL_NAME" ]]; then
+            config+="--config"
+            config+=" deploy_url='s3://nextstrain-staging/'"
+            config+=" auspice_prefix='"$TRIAL_NAME"'"
+          fi
+
+          echo "config=$config" >> "$GITHUB_OUTPUT"
+    outputs:
+      config_overrides: ${{ steps.config.outputs.config }}
+
+  rebuild_clade_i:
+    needs: [set_config_overrides]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      # We can migrate to AWS Batch when/if we need to for more resources,
+      # but at the time of writing the clade-I build is small & quick
+      runtime: docker
+      env: |
+        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
+        GITHUB_RUN_ID: ${{ github.run_id }}
+        SLACK_CHANNELS: ${{ inputs.trial_name && vars.TEST_SLACK_CHANNEL || vars.SLACK_CHANNELS }}
+        BUILD_DIR: phylogenetic
+        BUILD_NAME: clade-i
+      run: |
+        nextstrain build \
+          --env GITHUB_RUN_ID \
+          --env SLACK_TOKEN \
+          --env SLACK_CHANNELS \
+          . \
+            notify_on_deploy \
+            --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \
+            $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile

From 6f92ff9e7e3d7c638da03313d7a48ccffa0c6b67 Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Fri, 21 Jun 2024 13:37:19 +1200
Subject: [PATCH 3/3] add clade-I to readme

---
 phylogenetic/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 865c361b..ea426f03 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -80,6 +80,12 @@ Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with:
 nextstrain build . --configfile defaults/hmpxv1_big/config.yaml
 ```
 
+Run pipeline to produce the "clade I" tree for `/mpox/clade-I` with:
+
+```bash
+nextstrain build . --configfile defaults/clade-i/config.yaml
+```
+
 ### Visualize results
 
 View results with: