From d053086eab6c3aef5101bf3f1a25609398500a3e Mon Sep 17 00:00:00 2001 From: Jennifer Chang <jennifer.chang.bioinform@gmail.com> Date: Wed, 4 Oct 2023 14:45:10 -0700 Subject: [PATCH 1/2] Replace join metadata and clades script with csvtk and tsv append As part of centralizing ingest scripts, replace the join-metadata-and-clades.py script with csvtk and tsv-join (using --append-fields) when there aren't any customized calculations. https://github.com/nextstrain/ingest/pull/23 Relatedly, this commit also adds a nextclade config section that specifies the Nextclade sequence ID field and a field map of the Nextclade output fields to be renamed (if desired) and appended to the metadata. Co-authored-by: Jover Lee <joverlee521@gmail.com> --- ingest/bin/join-metadata-and-clades.py | 77 ------------------- ingest/config/config.yaml | 7 ++ ingest/source-data/nextclade-field-map.tsv | 16 ++++ ingest/workflow/snakemake_rules/nextclade.smk | 28 +++++-- 4 files changed, 45 insertions(+), 83 deletions(-) delete mode 100755 ingest/bin/join-metadata-and-clades.py create mode 100644 ingest/source-data/nextclade-field-map.tsv diff --git a/ingest/bin/join-metadata-and-clades.py b/ingest/bin/join-metadata-and-clades.py deleted file mode 100755 index 3a0e919e..00000000 --- a/ingest/bin/join-metadata-and-clades.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import re -import sys -import pandas as pd - -NEXTCLADE_JOIN_COLUMN_NAME = 'seqName' -VALUE_MISSING_DATA = '?' 
- -column_map = { - "clade": "clade", - "outbreak": "outbreak", - "lineage": "lineage", - "coverage": "coverage", - "totalMissing": "missing_data", - "totalSubstitutions": "divergence", - "totalNonACGTNs": "nonACGTN", - "qc.missingData.status": "QC_missing_data", - "qc.mixedSites.status": "QC_mixed_sites", - "qc.privateMutations.status": "QC_rare_mutations", - "qc.frameShifts.status": "QC_frame_shifts", - "qc.stopCodons.status": "QC_stop_codons", - "frameShifts": "frame_shifts", - "isReverseComplement": "is_reverse_complement", -# "deletions": "deletions", -# "insertions": "insertions" -# "substitutions": "substitutions", -# "aaSubstitutions": "aaSubstitutions" -} - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Joins metadata file with Nextclade clade output", - ) - parser.add_argument("--metadata") - parser.add_argument("--nextclade") - parser.add_argument("--id-field") - parser.add_argument("-o", default=sys.stdout) - return parser.parse_args() - -def main(): - args = parse_args() - - metadata = pd.read_csv(args.metadata, index_col=args.id_field, - sep='\t', low_memory=False, na_filter = False) - - # Read and rename clade column to be more descriptive - clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME, - sep='\t', low_memory=False, na_filter = False) \ - .rename(columns=column_map) - - clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x)) - - # Select columns in column map - clades = clades[list(column_map.values())] - - # Separate long from short columns - short_metadata = metadata.iloc[:,:-2].copy() - long_metadata = metadata.iloc[:,-2:].copy() - - # Concatenate on columns - result = pd.merge( - short_metadata, clades, - left_index=True, - right_index=True, - how='left' - ) - - # Add long columns to back - result = pd.concat([result, long_metadata], axis=1) - - result.to_csv(args.o, index_label=args.id_field, sep='\t') - - -if __name__ == '__main__': - main() diff --git a/ingest/config/config.yaml 
b/ingest/config/config.yaml index bae5fc9e..a8e26f70 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -66,3 +66,10 @@ transform: 'authors', 'institution' ] + +# Params for Nextclade related rules +nextclade: + # Field to use as the sequence ID in the Nextclade file + id_field: 'seqName' + # Fields from a Nextclade file to be renamed (if desired) and appended to a metadata file + field_map: 'source-data/nextclade-field-map.tsv' diff --git a/ingest/source-data/nextclade-field-map.tsv b/ingest/source-data/nextclade-field-map.tsv new file mode 100644 index 00000000..a495da34 --- /dev/null +++ b/ingest/source-data/nextclade-field-map.tsv @@ -0,0 +1,16 @@ +key value +seqName seqName +clade clade +outbreak outbreak +lineage lineage +coverage coverage +totalMissing missing_data +totalSubstitutions divergence +totalNonACGTNs nonACGTN +qc.missingData.status QC_missing_data +qc.mixedSites.status QC_mixed_sites +qc.privateMutations.status QC_rare_mutations +qc.frameShifts.status QC_frame_shifts +qc.stopCodons.status QC_stop_codons +frameShifts frame_shifts +isReverseComplement is_reverse_complement \ No newline at end of file diff --git a/ingest/workflow/snakemake_rules/nextclade.smk b/ingest/workflow/snakemake_rules/nextclade.smk index 28da0a98..385ad6e4 100644 --- a/ingest/workflow/snakemake_rules/nextclade.smk +++ b/ingest/workflow/snakemake_rules/nextclade.smk @@ -56,15 +56,31 @@ rule join_metadata_clades: input: nextclade="data/nextclade.tsv", metadata="data/metadata_raw.tsv", + nextclade_field_map=config["nextclade"]["field_map"], output: - "data/metadata.tsv", + metadata="data/metadata.tsv", params: id_field=config["transform"]["id_field"], + nextclade_id_field=config["nextclade"]["id_field"], shell: """ - python3 bin/join-metadata-and-clades.py \ - --id-field {params.id_field} \ - --metadata {input.metadata} \ - --nextclade {input.nextclade} \ - -o {output} + export SUBSET_FIELDS=`awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' 
',' | sed 's/,$//g'` + # Quote the --write-all fill value below so the shell passes a literal ? instead of glob-expanding it + csvtk -tl cut -f $SUBSET_FIELDS \ + {input.nextclade} \ + | csvtk -tl rename2 \ + -F \ + -f '*' \ + -p '(.+)' \ + -r '{{kv}}' \ + -k {input.nextclade_field_map} \ + | tsv-join -H \ + --filter-file - \ + --key-fields {params.nextclade_id_field} \ + --data-fields {params.id_field} \ + --append-fields '*' \ + --write-all '?' \ + {input.metadata} \ + | tsv-select -H --exclude {params.nextclade_id_field} \ + > {output.metadata} """ From 4122deb1d51bdc3b39769451d3c3504feb41d7fb Mon Sep 17 00:00:00 2001 From: Jennifer Chang <jennifer.chang.bioinform@gmail.com> Date: Wed, 4 Oct 2023 15:27:43 -0700 Subject: [PATCH 2/2] fix: Windows/DOS line endings in metadata_raw.tsv This change fixes errors for tsv-utils downstream processing. For example: [tsv-join] Error processing command line arguments: Windows/DOS line ending found for data/metadata_raw.tsv --- ingest/bin/ndjson-to-tsv-and-fasta | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ingest/bin/ndjson-to-tsv-and-fasta b/ingest/bin/ndjson-to-tsv-and-fasta index 017bcc00..d9d7331d 100755 --- a/ingest/bin/ndjson-to-tsv-and-fasta +++ b/ingest/bin/ndjson-to-tsv-and-fasta @@ -37,7 +37,8 @@ if __name__ == '__main__': args.metadata_columns, restval="", extrasaction='ignore', - delimiter='\t' + delimiter='\t', + lineterminator='\n', ) metadata_csv.writeheader()