From 3f9888efbd35509a6c87e344d7c6244cf8bb7f45 Mon Sep 17 00:00:00 2001 From: Jover Date: Fri, 6 Oct 2023 16:46:10 -0700 Subject: [PATCH] ingest/nextclade: Add join of metadata and Nextclade outputs The shell script for joining the metadata and Nextclade outputs is taken from @j23414's work in https://github.com/nextstrain/monkeypox/pull/207 Co-authored-by: Jennifer Chang --- ingest/config/defaults.yaml | 3 +++ ingest/config/nextclade_column_map.tsv | 18 +++++++++++++++ ingest/rules/nextclade.smk | 32 ++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 ingest/config/nextclade_column_map.tsv diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index f3a9b7e..e97851a 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -82,3 +82,6 @@ nextclade: # The name of the Nextclade dataset to use for running nextclade. # Run `nextclade dataset list` to get a full list of available Nextclade datasets dataset_name: "" + # Path to the mapping for renaming Nextclade output columns + # The path should be relative to the ingest directory + column_map: "config/nextclade_column_map.tsv" diff --git a/ingest/config/nextclade_column_map.tsv b/ingest/config/nextclade_column_map.tsv new file mode 100644 index 0000000..513b0fd --- /dev/null +++ b/ingest/config/nextclade_column_map.tsv @@ -0,0 +1,18 @@ +# TSV file that is a mapping of column names for Nextclade output TSV +# The first column should be the original column name of the Nextclade TSV +# The second column should be the new column name to use in the final metadata TSV +# Nextclade can have pathogen specific output columns so make sure to check which +# columns would be useful for your downstream phylogenetic analysis. 
+seqName	seqName
+clade	clade
+lineage	lineage
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 3a9b1ef..e90a680 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -48,3 +48,43 @@ rule run_nextclade:
 
         zip -rj {output.translations} results/translations
         """
+
+rule join_metadata_and_nextclade:
+    """
+    Join the metadata TSV with selected, renamed columns of the Nextclade TSV.
+
+    The first column of the column map selects the Nextclade columns to keep and
+    the second column gives their new names. Comment lines (starting with `#`)
+    and blank lines in the column map are skipped when selecting columns.
+    All metadata records are written; `?` is the fill value passed to tsv-join.
+    """
+    input:
+        nextclade="results/nextclade.tsv",
+        metadata="results/subset_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["column_map"],
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        metadata_id_field=config["curate"]["output_id_field"],
+    shell:
+        """
+        export SUBSET_FIELDS=`awk '!/^#/ && NF {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'`
+
+        csvtk -tl cut -f "$SUBSET_FIELDS" \
+            {input.nextclade} \
+        | csvtk -tl rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields seqName \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude seqName \
+            > {output.metadata}
+        """