From d6330653915e9afb6472501238edb2c5f9525a28 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 11 Dec 2024 16:44:13 -0800
Subject: [PATCH] Replace genbank_accession with accession

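Rename the ingest metadata columns genbank_accession and
genbank_accession_rev to accession and accession_version.

This simplifies the USVI data merge: the add_metadata_columns rule is
removed from the phylogenetic workflow, and append_usvi now reads
data/metadata.tsv directly instead of data/metadata_modified.tsv.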
---
 ingest/defaults/config.yaml                 | 16 ++++++-------
 ingest/rules/fetch_from_ncbi.smk            |  6 ++---
 phylogenetic/rules/merge_sequences_usvi.smk | 26 +--------------------
 3 files changed, 12 insertions(+), 36 deletions(-)

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 919b698..dc8efb0 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -45,8 +45,8 @@ curate:
   # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    accession: genbank_accession
-    accession-rev: genbank_accession_rev
+    accession: accession
+    accession_version: accession_version  # key matches the column renamed in fetch_from_ncbi.smk
     isolate-lineage: strain
     sourcedb: database
     geo-region: region
@@ -62,7 +62,7 @@ curate:
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'
   # Back up strain name field to use if 'strain' doesn't match regex above
-  strain_backup_fields: ['genbank_accession']
+  strain_backup_fields: ['accession']
   # List of date fields to standardize to ISO format YYYY-MM-DD
   date_fields: ['date', 'release_date', 'update_date']
   # List of expected date formats that are present in the date fields provided above
@@ -89,17 +89,17 @@ curate:
   # The path should be relative to the ingest directory
   annotations: "defaults/annotations.tsv"
   # The ID field in the metadata to use to merge the manual annotations
-  annotations_id: 'genbank_accession'
+  annotations_id: 'accession'
   # The ID field in the metadata to use as the sequence id in the output FASTA file
-  output_id_field: 'genbank_accession'
+  output_id_field: 'accession'
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: 'sequence'
   # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: 'genbank_accession'
+  genbank_accession: 'accession'
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
-    'genbank_accession',
-    'genbank_accession_rev',
+    'accession',
+    'accession_version',
     'strain',
     'date',
     'region',
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
index 3c32e42..ca6cedb 100644
--- a/ingest/rules/fetch_from_ncbi.smk
+++ b/ingest/rules/fetch_from_ncbi.smk
@@ -66,8 +66,8 @@ rule format_ncbi_dataset_report:
             --elide-header \
             | csvtk fix-quotes -Ht \
             | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
-            | csvtk rename -t -f accession -n accession-rev \
-            | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | csvtk rename -t -f accession -n accession_version \
+            | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
             | csvtk del-quotes -t \
             | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
@@ -89,7 +89,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession-rev \
+            --seq-id-column accession_version \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \
diff --git a/phylogenetic/rules/merge_sequences_usvi.smk b/phylogenetic/rules/merge_sequences_usvi.smk
index ffc7a50..3765937 100644
--- a/phylogenetic/rules/merge_sequences_usvi.smk
+++ b/phylogenetic/rules/merge_sequences_usvi.smk
@@ -21,35 +21,11 @@ This part of the workflow usually includes the following steps:
 
 """
 
-rule add_metadata_columns:
-    """Add columns to metadata
-
-    Notable columns:
-    - genbank_accession: GenBank accession for Auspice to generate a URL to the NCBI GenBank record.
-    - [NEW] accession: The GenBank accession. Added to go alongside USVI accession.
-    - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). Added to go alongside USVI url.
-    """
-    input:
-        metadata = "data/metadata.tsv"
-    output:
-        metadata = "data/metadata_modified.tsv"
-    shell:
-        """
-        csvtk mutate2 -tl \
-          -n url \
-          -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \
-          {input.metadata} \
-        | csvtk mutate2 -tl \
-          -n accession \
-          -e '$genbank_accession' \
-        > {output.metadata}
-        """
-
 rule append_usvi:
     """Appending USVI sequences"""
     input:
         sequences = "data/sequences.fasta",
-        metadata = "data/metadata_modified.tsv",
+        metadata = "data/metadata.tsv",
         usvi_sequences = "data/sequences_usvi.fasta",
         usvi_metadata = "data/metadata_usvi.tsv"
     output: