From d053086eab6c3aef5101bf3f1a25609398500a3e Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 4 Oct 2023 14:45:10 -0700
Subject: [PATCH 1/2] Replace join metadata and clades script with csvtk and
 tsv append

As part of centralizing ingest scripts, replace the join-metadata-and-clades.py
script with csvtk and tsv-utils (tsv-join, tsv-select) when there aren't any
customized calculations.

https://github.com/nextstrain/ingest/pull/23

Relatedly, this commit also adds a nextclade config section that specifies
which fields from the nextclade output are appended to the metadata and how
they are renamed.

Co-authored-by: Jover Lee <joverlee521@gmail.com>
---
 ingest/bin/join-metadata-and-clades.py        | 77 -------------------
 ingest/config/config.yaml                     |  7 ++
 ingest/source-data/nextclade-field-map.tsv    | 16 ++++
 ingest/workflow/snakemake_rules/nextclade.smk | 28 +++++--
 4 files changed, 45 insertions(+), 83 deletions(-)
 delete mode 100755 ingest/bin/join-metadata-and-clades.py
 create mode 100644 ingest/source-data/nextclade-field-map.tsv

diff --git a/ingest/bin/join-metadata-and-clades.py b/ingest/bin/join-metadata-and-clades.py
deleted file mode 100755
index 3a0e919e..00000000
--- a/ingest/bin/join-metadata-and-clades.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import re
-import sys
-import pandas as pd
-
-NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
-VALUE_MISSING_DATA = '?'
-
-column_map = {
-    "clade": "clade",
-    "outbreak": "outbreak",
-    "lineage": "lineage",
-    "coverage": "coverage",
-    "totalMissing": "missing_data",
-    "totalSubstitutions": "divergence",
-    "totalNonACGTNs": "nonACGTN",
-    "qc.missingData.status": "QC_missing_data",
-    "qc.mixedSites.status": "QC_mixed_sites",
-    "qc.privateMutations.status": "QC_rare_mutations",
-    "qc.frameShifts.status": "QC_frame_shifts",
-    "qc.stopCodons.status": "QC_stop_codons",
-    "frameShifts": "frame_shifts",
-    "isReverseComplement": "is_reverse_complement",
-#    "deletions": "deletions",
-#    "insertions": "insertions"
-#    "substitutions": "substitutions",
-#    "aaSubstitutions": "aaSubstitutions"
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Joins metadata file with Nextclade clade output",
-    )
-    parser.add_argument("--metadata")
-    parser.add_argument("--nextclade")
-    parser.add_argument("--id-field")
-    parser.add_argument("-o", default=sys.stdout)
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-
-    metadata = pd.read_csv(args.metadata, index_col=args.id_field,
-                           sep='\t', low_memory=False, na_filter = False)
-
-    # Read and rename clade column to be more descriptive
-    clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
-                         sep='\t', low_memory=False, na_filter = False) \
-            .rename(columns=column_map)
-
-    clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x))
-
-    # Select columns in column map
-    clades = clades[list(column_map.values())]
-
-    # Separate long from short columns
-    short_metadata = metadata.iloc[:,:-2].copy()
-    long_metadata = metadata.iloc[:,-2:].copy()
-
-    # Concatenate on columns
-    result = pd.merge(
-        short_metadata, clades,
-        left_index=True,
-        right_index=True,
-        how='left'
-    )
-
-    # Add long columns to back
-    result = pd.concat([result, long_metadata], axis=1)
-
-    result.to_csv(args.o, index_label=args.id_field, sep='\t')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index bae5fc9e..a8e26f70 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -66,3 +66,10 @@ transform:
     'authors',
     'institution'
   ]
+
+# Params for Nextclade related rules
+nextclade:
+  # Field to use as the sequence ID in the Nextclade file
+  id_field: 'seqName'
+  # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file
+  field_map: 'source-data/nextclade-field-map.tsv'
diff --git a/ingest/source-data/nextclade-field-map.tsv b/ingest/source-data/nextclade-field-map.tsv
new file mode 100644
index 00000000..a495da34
--- /dev/null
+++ b/ingest/source-data/nextclade-field-map.tsv
@@ -0,0 +1,16 @@
+key	value
+seqName	seqName
+clade	clade
+outbreak	outbreak
+lineage	lineage
+coverage	coverage
+totalMissing	missing_data
+totalSubstitutions	divergence
+totalNonACGTNs	nonACGTN
+qc.missingData.status	QC_missing_data
+qc.mixedSites.status	QC_mixed_sites
+qc.privateMutations.status	QC_rare_mutations
+qc.frameShifts.status	QC_frame_shifts
+qc.stopCodons.status	QC_stop_codons
+frameShifts	frame_shifts
+isReverseComplement	is_reverse_complement
\ No newline at end of file
diff --git a/ingest/workflow/snakemake_rules/nextclade.smk b/ingest/workflow/snakemake_rules/nextclade.smk
index 28da0a98..385ad6e4 100644
--- a/ingest/workflow/snakemake_rules/nextclade.smk
+++ b/ingest/workflow/snakemake_rules/nextclade.smk
@@ -56,15 +56,31 @@ rule join_metadata_clades:
     input:
         nextclade="data/nextclade.tsv",
         metadata="data/metadata_raw.tsv",
+        nextclade_field_map=config["nextclade"]["field_map"],
     output:
-        "data/metadata.tsv",
+        metadata="data/metadata.tsv",
     params:
         id_field=config["transform"]["id_field"],
+        nextclade_id_field=config["nextclade"]["id_field"],
     shell:
         """
-        python3 bin/join-metadata-and-clades.py \
-                --id-field {params.id_field} \
-                --metadata {input.metadata} \
-                --nextclade {input.nextclade} \
-                -o {output}
+        SUBSET_FIELDS="$(awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g')"
+        # Cut and rename the mapped Nextclade fields, then append them to the metadata; unmatched rows get '?'.
+        csvtk -tl cut -f "$SUBSET_FIELDS" \
+            {input.nextclade} \
+        | csvtk -tl rename2 \
+            -F \
+            -f '*' \
+            -p '(.+)' \
+            -r '{{kv}}' \
+            -k {input.nextclade_field_map} \
+        | tsv-join -H \
+            --filter-file - \
+            --key-fields {params.nextclade_id_field} \
+            --data-fields {params.id_field} \
+            --append-fields '*' \
+            --write-all '?' \
+            {input.metadata} \
+        | tsv-select -H --exclude {params.nextclade_id_field} \
+            > {output.metadata}
         """

From 4122deb1d51bdc3b39769451d3c3504feb41d7fb Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 4 Oct 2023 15:27:43 -0700
Subject: [PATCH 2/2] fix: Windows/DOS line endings in metadata_raw.tsv

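Python's csv writer terminates rows with '\r\n' by default, so write
metadata_raw.tsv with an explicit '\n' line terminator instead.
This change fixes errors for tsv-utils downstream processing.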
For example:

 [tsv-join] Error processing command line arguments: Windows/DOS line ending found for data/metadata_raw.tsv
---
 ingest/bin/ndjson-to-tsv-and-fasta | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ingest/bin/ndjson-to-tsv-and-fasta b/ingest/bin/ndjson-to-tsv-and-fasta
index 017bcc00..d9d7331d 100755
--- a/ingest/bin/ndjson-to-tsv-and-fasta
+++ b/ingest/bin/ndjson-to-tsv-and-fasta
@@ -37,7 +37,8 @@ if __name__ == '__main__':
                 args.metadata_columns,
                 restval="",
                 extrasaction='ignore',
-                delimiter='\t'
+                delimiter='\t',
+                lineterminator='\n',
             )
             metadata_csv.writeheader()