Skip to content

Commit

Permalink
Consistent serotype-genotype column names and auspice color titles #51
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored May 21, 2024
2 parents 074394d + f941515 commit 57b57d1
Show file tree
Hide file tree
Showing 8 changed files with 1,074 additions and 1,054 deletions.
1,998 changes: 999 additions & 999 deletions ingest/defaults/annotations.tsv

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ curate:
local_geolocation_rules: 'defaults/geolocation-rules.tsv'
# User annotations file
annotations: 'defaults/annotations.tsv'
# Name of the metadata column that stores the serotype inferred from the NCBI GenBank annotation
serotype_field: 'serotype_genbank'
# ID field used to merge annotations
annotations_id: 'genbank_accession'
# Field to use as the sequence ID in the FASTA file
Expand All @@ -99,11 +101,11 @@ curate:
'host',
'release_date',
'update_date',
'ncbi_serotype', # inferred from virus_tax_id
'sra_accessions',
'abbr_authors',
'authors',
'institution'
'institution',
'serotype_genbank', # inferred from virus_tax_id
]

nextclade:
Expand All @@ -113,7 +115,7 @@ nextclade:
# Nextclade Fields to rename to metadata field names.
field_map:
seqName: genbank_accession # ID field used to merge annotations
clade: nextclade_subtype
clade: genotype_nextclade
alignmentStart: alignmentStart
alignmentEnd: alignmentEnd
coverage: genome_coverage
Expand Down
2 changes: 2 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ rule curate:
authors_default_value=config["curate"]["authors_default_value"],
abbr_authors_field=config["curate"]["abbr_authors_field"],
annotations_id=config["curate"]["annotations_id"],
serotype_field=config["curate"]["serotype_field"],
metadata_columns=config["curate"]["metadata_columns"],
id_field=config["curate"]["id_field"],
sequence_field=config["curate"]["sequence_field"],
Expand Down Expand Up @@ -93,6 +94,7 @@ rule curate:
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/infer-dengue-serotype.py \
--out-col {params.serotype_field} \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
22 changes: 12 additions & 10 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ REQUIRED INPUTS:
nextclade_datasets = ../nextclade_data/{serotype}
OUTPUTS:
metadata = results/metadata_{serotype}.tsv
nextclade = results/nextclade_subtypes.tsv
nextclade = results/nextclade_genotypes.tsv
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
Expand All @@ -18,7 +18,7 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES)

rule nextclade_denvX:
"""
For each type, classify into the appropriate subtype
For each type, classify into the appropriate Dengue genotype
1. Capture the alignment
2. Capture the translations of gene(s) of interest
Expand Down Expand Up @@ -52,26 +52,26 @@ rule nextclade_denvX:
{input.sequences}
"""

rule concat_nextclade_subtype_results:
rule concat_genotype_nextclade_results:
"""
Concatenate all the nextclade results for dengue subtype classification
Concatenate all the nextclade results for dengue genotype classification
"""
input:
nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES),
output:
nextclade_subtypes="results/nextclade_subtypes.tsv",
genotype_nextclade="results/nextclade_genotypes.tsv",
params:
input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]),
output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]),
shell:
"""
echo "{params.output_nextclade_fields}" \
| tr ',' '\t' \
> {output.nextclade_subtypes}
> {output.genotype_nextclade}
tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \
| awk 'NR>1 {{print}}' \
>> {output.nextclade_subtypes}
>> {output.genotype_nextclade}
"""

rule append_nextclade_columns:
Expand All @@ -80,7 +80,7 @@ rule append_nextclade_columns:
"""
input:
metadata="data/metadata_all.tsv",
nextclade_subtypes="results/nextclade_subtypes.tsv",
genotype_nextclade="results/nextclade_genotypes.tsv",
output:
metadata_all="data/metadata_nextclade.tsv",
params:
Expand All @@ -89,7 +89,7 @@ rule append_nextclade_columns:
shell:
"""
tsv-join -H \
--filter-file {input.nextclade_subtypes} \
--filter-file {input.genotype_nextclade} \
--key-fields {params.id_field} \
--append-fields {params.output_nextclade_fields} \
--write-all ? \
Expand Down Expand Up @@ -164,7 +164,9 @@ rule split_metadata_by_serotype:
serotype_metadata="results/metadata_{serotype}.tsv"
wildcard_constraints:
serotype=SEROTYPE_CONSTRAINTS
params:
serotype_field=config["curate"]["serotype_field"],
shell:
"""
tsv-filter -H --str-eq ncbi_serotype:{wildcards.serotype} {input.metadata} > {output.serotype_metadata}
tsv-filter -H --str-eq {params.serotype_field}:{wildcards.serotype} {input.metadata} > {output.serotype_metadata}
"""
9 changes: 5 additions & 4 deletions ingest/rules/split_serotypes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,24 @@ This will produce output files as
Parameters are expected to be defined in `config.curate`.
"""

rule split_by_ncbi_serotype:
rule split_by_serotype_genbank:
"""
Split the data by serotype based on the NCBI metadata.
Split the data by serotype based on the NCBI Genbank metadata.
"""
input:
metadata = "data/metadata_all.tsv",
sequences = "results/sequences_all.fasta"
output:
sequences = "results/sequences_{serotype}.fasta"
params:
id_field = config["curate"]["id_field"]
id_field = config["curate"]["id_field"],
serotype_field = config["curate"]["serotype_field"]
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.id_field} \
--query "ncbi_serotype=='{wildcards.serotype}'" \
--query "{params.serotype_field}=='{wildcards.serotype}'" \
--output-sequences {output.sequences}
"""
44 changes: 22 additions & 22 deletions phylogenetic/config/color_orderings.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -235,28 +235,28 @@ recency New

################

ncbi_serotype denv1
ncbi_serotype denv2
ncbi_serotype denv3
ncbi_serotype denv4
serotype_genbank denv1
serotype_genbank denv2
serotype_genbank denv3
serotype_genbank denv4

nextclade_subtype DENV1/I
nextclade_subtype DENV1/II
nextclade_subtype DENV1/III
nextclade_subtype DENV1/IV
nextclade_subtype DENV1/V
nextclade_subtype DENV2/AA
nextclade_subtype DENV2/AI
nextclade_subtype DENV2/AII
nextclade_subtype DENV2/AM
nextclade_subtype DENV2/C
nextclade_subtype DENV2/S
nextclade_subtype DENV3/I
nextclade_subtype DENV3/II
nextclade_subtype DENV3/III
nextclade_subtype DENV3/IV
nextclade_subtype DENV4/I
nextclade_subtype DENV4/II
nextclade_subtype DENV4/S
genotype_nextclade DENV1/I
genotype_nextclade DENV1/II
genotype_nextclade DENV1/III
genotype_nextclade DENV1/IV
genotype_nextclade DENV1/V
genotype_nextclade DENV2/AA
genotype_nextclade DENV2/AI
genotype_nextclade DENV2/AII
genotype_nextclade DENV2/AM
genotype_nextclade DENV2/C
genotype_nextclade DENV2/S
genotype_nextclade DENV3/I
genotype_nextclade DENV3/II
genotype_nextclade DENV3/III
genotype_nextclade DENV3/IV
genotype_nextclade DENV4/I
genotype_nextclade DENV4/II
genotype_nextclade DENV4/S

################
10 changes: 5 additions & 5 deletions phylogenetic/config/config_dengue.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ filter:
traits:
sampling_bias_correction: '3'
traits_columns:
all: 'region ncbi_serotype nextclade_subtype'
denv1: 'country region ncbi_serotype nextclade_subtype'
denv2: 'country region ncbi_serotype nextclade_subtype'
denv3: 'country region ncbi_serotype nextclade_subtype'
denv4: 'country region ncbi_serotype nextclade_subtype'
all: 'region serotype_genbank genotype_nextclade'
denv1: 'country region serotype_genbank genotype_nextclade'
denv2: 'country region serotype_genbank genotype_nextclade'
denv3: 'country region serotype_genbank genotype_nextclade'
denv4: 'country region serotype_genbank genotype_nextclade'

clades:
clade_definitions:
Expand Down
35 changes: 24 additions & 11 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ rule prepare_auspice_config:
output:
auspice_config="results/config/{gene}/auspice_config_{serotype}.json",
params:
replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"nextclade_subtype",
replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"DENV genotype",
replace_clade_key=lambda wildcard: r"clade_membership" if wildcard.gene in ['genome'] else r"genotype_nextclade",
replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"Dengue Genotype (Nextclade)",
run:
data = {
"title": "Real-time tracking of dengue virus evolution",
Expand Down Expand Up @@ -73,18 +73,13 @@ rule prepare_auspice_config:
"type": "categorical"
},
{
"key": params.replace_clade_key,
"title": params.replace_clade_title,
"key": "genotype_nextclade",
"title": "Dengue Genotype (Nextclade)",
"type": "categorical"
},
{
"key": "nextclade_subtype",
"title": "Nextclade genotype",
"type": "categorical"
},
{
"key": "ncbi_serotype",
"title": "NCBI serotype",
"key": "serotype_genbank",
"title": "Serotype (Genbank metadata)",
"type": "categorical"
}
],
Expand All @@ -106,6 +101,24 @@ rule prepare_auspice_config:
]
}

# In genome/dengue_all workflows, clade_membership represents the serotype,
# whereas in genome/dengue_denvX workflows, clade_membership represents the more detailed genotype.
if params.replace_clade_key == 'clade_membership':
if wildcards.gene in ['genome'] and wildcards.serotype in ['all']:
clade_membership_title="Serotype (Nextstrain)"
else:
clade_membership_title="Dengue Genotype (Nextstrain)"

data["colorings"].append({
"key": "clade_membership",
"title": clade_membership_title,
"type": "categorical"
})
else:
# During E/dengue_all workflows, default color by Serotype
if wildcards.serotype in ['all']:
data["display_defaults"]["color_by"]="serotype_genbank"

with open(output.auspice_config, 'w') as fh:
json.dump(data, fh, indent=2)

Expand Down

0 comments on commit 57b57d1

Please sign in to comment.