From 3f9888efbd35509a6c87e344d7c6244cf8bb7f45 Mon Sep 17 00:00:00 2001
From: Jover <joverlee521@gmail.com>
Date: Fri, 6 Oct 2023 16:46:10 -0700
Subject: [PATCH] ingest/nextclade: Add join of metadata and Nextclade outputs

The shell script for joining the metadata and Nextclade outputs is taken
from @j23414's work in https://github.com/nextstrain/monkeypox/pull/207

Co-authored-by: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
---
 ingest/config/defaults.yaml            |  3 +++
 ingest/config/nextclade_column_map.tsv | 18 +++++++++++++++
 ingest/rules/nextclade.smk             | 32 ++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 ingest/config/nextclade_column_map.tsv

diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
index f3a9b7e..e97851a 100644
--- a/ingest/config/defaults.yaml
+++ b/ingest/config/defaults.yaml
@@ -82,3 +82,6 @@ nextclade:
   # The name of the Nextclade dataset to use for running nextclade.
   # Run `nextclade dataset list` to get a full list of available Nextclade datasets
   dataset_name: ""
+  # Path to the TSV that selects which Nextclade output columns to keep and maps
+  # them to new column names. The path should be relative to the ingest directory.
+  column_map: "config/nextclade_column_map.tsv"
diff --git a/ingest/config/nextclade_column_map.tsv b/ingest/config/nextclade_column_map.tsv
new file mode 100644
index 0000000..513b0fd
--- /dev/null
+++ b/ingest/config/nextclade_column_map.tsv
@@ -0,0 +1,18 @@
+# TSV file that maps Nextclade output column names to final metadata column names.
+# The first column is the original column name in the Nextclade output TSV.
+# The second column is the new column name to use in the final metadata TSV.
+# Nextclade can have pathogen-specific output columns, so make sure to check which
+# columns would be useful for your downstream phylogenetic analysis.
+seqName	seqName
+clade	clade
+lineage	lineage
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 3a9b1ef..e90a680 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -48,3 +48,35 @@ rule run_nextclade:
         zip -rj {output.translations} results/translations
         """
 
+
+# Keep only the Nextclade columns listed in the column map, rename them to the
+# map's second column, and append them to the metadata records by sequence ID.
+# Metadata records without a Nextclade match get '?' in the appended columns.
+rule join_metadata_and_nextclade:
+    input:
+        nextclade="results/nextclade.tsv",
+        metadata="results/subset_metadata.tsv",
+        nextclade_field_map=config["nextclade"]["column_map"],
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        metadata_id_field=config["curate"]["output_id_field"],
+    shell:
+        """
+        # Skip comment and blank lines of the column map so only column names are cut.
+        export SUBSET_FIELDS=$(awk '!/^#/ && NF {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g')
+
+        csvtk -tl cut -f "$SUBSET_FIELDS" \
+            {input.nextclade} \
+        | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields seqName \
+            --data-fields {params.metadata_id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude seqName \
+            > {output.metadata}
+        """