Consistent serotype-genotype column names and auspice color titles #51

nextstrain · May 21, 2024 · 57b57d1 · 57b57d1
2 parents 074394d + f941515
commit 57b57d1
Show file tree

Hide file tree

Showing 8 changed files with 1,074 additions and 1,054 deletions.
diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -79,6 +79,8 @@ curate:
   local_geolocation_rules: 'defaults/geolocation-rules.tsv'
   # User annotations file
   annotations: 'defaults/annotations.tsv'
+  # Name of the field that stores the serotype inferred from NCBI Genbank annotation
+  serotype_field: 'serotype_genbank'
   # ID field used to merge annotations
   annotations_id: 'genbank_accession'
   # Field to use as the sequence ID in the FASTA file
@@ -99,11 +101,11 @@ curate:
     'host',
     'release_date',
     'update_date',
-    'ncbi_serotype', # inferred from virus_tax_id
     'sra_accessions',
     'abbr_authors',
     'authors',
-    'institution'
+    'institution',
+    'serotype_genbank', # inferred from virus_tax_id
   ]
 
 nextclade:
@@ -113,7 +115,7 @@ nextclade:
   # Nextclade Fields to rename to metadata field names.
   field_map:
     seqName: genbank_accession # ID field used to merge annotations
-    clade: nextclade_subtype
+    clade: genotype_nextclade
     alignmentStart: alignmentStart
     alignmentEnd: alignmentEnd
     coverage: genome_coverage

diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -66,6 +66,7 @@ rule curate:
         authors_default_value=config["curate"]["authors_default_value"],
         abbr_authors_field=config["curate"]["abbr_authors_field"],
         annotations_id=config["curate"]["annotations_id"],
+        serotype_field=config["curate"]["serotype_field"],
         metadata_columns=config["curate"]["metadata_columns"],
         id_field=config["curate"]["id_field"],
         sequence_field=config["curate"]["sequence_field"],
@@ -93,6 +94,7 @@ rule curate:
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
             | ./bin/infer-dengue-serotype.py \
+                --out-col {params.serotype_field} \
             | ./vendored/merge-user-metadata \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -7,7 +7,7 @@ REQUIRED INPUTS:
     nextclade_datasets = ../nextclade_data/{serotype}
 OUTPUTS:
     metadata        = results/metadata_{serotype}.tsv
-    nextclade       = results/nextclade_subtypes.tsv
+    nextclade       = results/nextclade_genotypes.tsv
 See Nextclade docs for more details on usage, inputs, and outputs if you would
 like to customize the rules:
 https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
@@ -18,7 +18,7 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)
 
 rule nextclade_denvX:
     """
-    For each type, classify into the appropriate subtype
+    For each type, classify into the appropriate Dengue genotype
     1. Capture the alignment
     2. Capture the translations of gene(s) of interest
 
@@ -52,26 +52,26 @@ rule nextclade_denvX:
           {input.sequences}
         """
 
-rule concat_nextclade_subtype_results:
+rule concat_genotype_nextclade_results:
     """
-    Concatenate all the nextclade results for dengue subtype classification
+    Concatenate all the nextclade results for dengue genotype classification
     """
     input:
         nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
     output:
-        nextclade_subtypes="results/nextclade_subtypes.tsv",
+        genotype_nextclade="results/nextclade_genotypes.tsv",
     params:
         input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
         output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
     shell:
         """
         echo "{params.output_nextclade_fields}" \
         | tr ',' '\t' \
-        > {output.nextclade_subtypes}
+        > {output.genotype_nextclade}
 
         tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
         | awk 'NR>1 {{print}}' \
-        >> {output.nextclade_subtypes}
+        >> {output.genotype_nextclade}
         """
 
 rule append_nextclade_columns:
@@ -80,7 +80,7 @@ rule append_nextclade_columns:
     """
     input:
         metadata="data/metadata_all.tsv",
-        nextclade_subtypes="results/nextclade_subtypes.tsv",
+        genotype_nextclade="results/nextclade_genotypes.tsv",
     output:
         metadata_all="data/metadata_nextclade.tsv",
     params:
@@ -89,7 +89,7 @@ rule append_nextclade_columns:
     shell:
         """
         tsv-join -H \
-            --filter-file {input.nextclade_subtypes} \
+            --filter-file {input.genotype_nextclade} \
             --key-fields {params.id_field} \
             --append-fields {params.output_nextclade_fields} \
             --write-all ? \
@@ -164,7 +164,9 @@ rule split_metadata_by_serotype:
         serotype_metadata="results/metadata_{serotype}.tsv"
     wildcard_constraints:
         serotype=SEROTYPE_CONSTRAINTS
+    params:
+        serotype_field=config["curate"]["serotype_field"],
     shell:
         """
-        tsv-filter -H --str-eq ncbi_serotype:{wildcards.serotype} {input.metadata} > {output.serotype_metadata}
+        tsv-filter -H --str-eq {params.serotype_field}:{wildcards.serotype} {input.metadata} > {output.serotype_metadata}
         """
diff --git a/ingest/rules/split_serotypes.smk b/ingest/rules/split_serotypes.smk
@@ -12,23 +12,24 @@ This will produce output files as
 Parameters are expected to be defined in `config.curate`.
 """
 
-rule split_by_ncbi_serotype:
+rule split_by_serotype_genbank:
     """
-    Split the data by serotype based on the NCBI metadata.
+    Split the data by serotype based on the NCBI Genbank metadata.
     """
     input:
         metadata = "data/metadata_all.tsv",
         sequences = "results/sequences_all.fasta"
     output:
         sequences = "results/sequences_{serotype}.fasta"
     params:
-        id_field = config["curate"]["id_field"]
+        id_field = config["curate"]["id_field"],
+        serotype_field = config["curate"]["serotype_field"]
     shell:
         """
         augur filter \
           --sequences {input.sequences} \
           --metadata {input.metadata} \
           --metadata-id-columns {params.id_field} \
-          --query "ncbi_serotype=='{wildcards.serotype}'" \
+          --query "{params.serotype_field}=='{wildcards.serotype}'" \
           --output-sequences {output.sequences}
         """
diff --git a/phylogenetic/config/color_orderings.tsv b/phylogenetic/config/color_orderings.tsv
@@ -235,28 +235,28 @@ recency	New
 
 ################
 
-ncbi_serotype	denv1
-ncbi_serotype	denv2
-ncbi_serotype	denv3
-ncbi_serotype	denv4
+serotype_genbank	denv1
+serotype_genbank	denv2
+serotype_genbank	denv3
+serotype_genbank	denv4
 
-nextclade_subtype	DENV1/I
-nextclade_subtype	DENV1/II
-nextclade_subtype	DENV1/III
-nextclade_subtype	DENV1/IV
-nextclade_subtype	DENV1/V
-nextclade_subtype	DENV2/AA
-nextclade_subtype	DENV2/AI
-nextclade_subtype	DENV2/AII
-nextclade_subtype	DENV2/AM
-nextclade_subtype	DENV2/C
-nextclade_subtype	DENV2/S
-nextclade_subtype	DENV3/I
-nextclade_subtype	DENV3/II
-nextclade_subtype	DENV3/III
-nextclade_subtype	DENV3/IV
-nextclade_subtype	DENV4/I
-nextclade_subtype	DENV4/II
-nextclade_subtype	DENV4/S
+genotype_nextclade	DENV1/I
+genotype_nextclade	DENV1/II
+genotype_nextclade	DENV1/III
+genotype_nextclade	DENV1/IV
+genotype_nextclade	DENV1/V
+genotype_nextclade	DENV2/AA
+genotype_nextclade	DENV2/AI
+genotype_nextclade	DENV2/AII
+genotype_nextclade	DENV2/AM
+genotype_nextclade	DENV2/C
+genotype_nextclade	DENV2/S
+genotype_nextclade	DENV3/I
+genotype_nextclade	DENV3/II
+genotype_nextclade	DENV3/III
+genotype_nextclade	DENV3/IV
+genotype_nextclade	DENV4/I
+genotype_nextclade	DENV4/II
+genotype_nextclade	DENV4/S
 
 ################
diff --git a/phylogenetic/config/config_dengue.yaml b/phylogenetic/config/config_dengue.yaml
@@ -23,11 +23,11 @@ filter:
 traits:
   sampling_bias_correction: '3'
   traits_columns:
-    all: 'region ncbi_serotype nextclade_subtype'
-    denv1: 'country region ncbi_serotype nextclade_subtype'
-    denv2: 'country region ncbi_serotype nextclade_subtype'
-    denv3: 'country region ncbi_serotype nextclade_subtype'
-    denv4: 'country region ncbi_serotype nextclade_subtype'
+    all: 'region serotype_genbank genotype_nextclade'
+    denv1: 'country region serotype_genbank genotype_nextclade'
+    denv2: 'country region serotype_genbank genotype_nextclade'
+    denv3: 'country region serotype_genbank genotype_nextclade'
+    denv4: 'country region serotype_genbank genotype_nextclade'
 
 clades:
   clade_definitions:

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
@@ -42,8 +42,8 @@ rule prepare_auspice_config:
     output:
         auspice_config="results/config/{gene}/auspice_config_{serotype}.json",
     params:
-        replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"nextclade_subtype",
-        replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"DENV genotype",
+        replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"genotype_nextclade",
+        replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"Dengue Genotype (Nextclade)",
     run:
         data = {
             "title": "Real-time tracking of dengue virus evolution",
@@ -73,18 +73,13 @@ rule prepare_auspice_config:
                 "type": "categorical"
               },
               {
-                "key": params.replace_clade_key,
-                "title": params.replace_clade_title,
+                "key": "genotype_nextclade",
+                "title": "Dengue Genotype (Nextclade)",
                 "type": "categorical"
               },
               {
-                "key": "nextclade_subtype",
-                "title": "Nextclade genotype",
-                "type": "categorical"
-              },
-              {
-                "key": "ncbi_serotype",
-                "title": "NCBI serotype",
+                "key": "serotype_genbank",
+                "title": "Serotype (Genbank metadata)",
                 "type": "categorical"
               }
             ],
@@ -106,6 +101,24 @@ rule prepare_auspice_config:
             ]
           }
 
+        # In genome/dengue_all workflows, clade_membership represents the Serotype,
+        # while in genome/dengue_denvX workflows, clade_membership represents the more detailed Genotype
+        if params.replace_clade_key == 'clade_membership':
+            if wildcards.gene in ['genome'] and wildcards.serotype in ['all']:
+                clade_membership_title="Serotype (Nextstrain)"
+            else:
+                clade_membership_title="Dengue Genotype (Nextstrain)"
+
+            data["colorings"].append({
+                "key": "clade_membership",
+                "title": clade_membership_title,
+                "type": "categorical"
+            })
+        else:
+            # During E/dengue_all workflows, default color by Serotype
+            if wildcards.serotype in ['all']:
+                data["display_defaults"]["color_by"]="serotype_genbank"
+
         with open(output.auspice_config, 'w') as fh:
             json.dump(data, fh, indent=2)