From d6330653915e9afb6472501238edb2c5f9525a28 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 11 Dec 2024 16:44:13 -0800 Subject: [PATCH] Replace genbank_accession with accession This simplifies USVI data merge --- ingest/defaults/config.yaml | 16 ++++++------- ingest/rules/fetch_from_ncbi.smk | 6 ++--- phylogenetic/rules/merge_sequences_usvi.smk | 26 +-------------------- 3 files changed, 12 insertions(+), 36 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 919b698..dc8efb0 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -45,8 +45,8 @@ curate: # The original field names should match the ncbi_datasets_fields provided above. # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: - accession: genbank_accession - accession-rev: genbank_accession_rev + accession: accession + accession_version: accession_version isolate-lineage: strain sourcedb: database geo-region: region @@ -62,7 +62,7 @@ curate: # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: '^.+$' # Back up strain name field to use if 'strain' doesn't match regex above - strain_backup_fields: ['genbank_accession'] + strain_backup_fields: ['accession'] # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: ['date', 'release_date', 'update_date'] # List of expected date formats that are present in the date fields provided above @@ -89,17 +89,17 @@ curate: # The path should be relative to the ingest directory annotations: "defaults/annotations.tsv" # The ID field in the metadata to use to merge the manual annotations - annotations_id: 'genbank_accession' + annotations_id: 'accession' # The ID field in the metadata to use as the sequence id in the output FASTA file - output_id_field: 'genbank_accession' + output_id_field: 'accession' # The field in the NDJSON record 
that contains the actual genomic sequence output_sequence_field: 'sequence' # The field in the NDJSON record that contains the actual GenBank accession - genbank_accession: 'genbank_accession' + genbank_accession: 'accession' # The list of metadata columns to keep in the final output of the curation pipeline. metadata_columns: [ - 'genbank_accession', - 'genbank_accession_rev', + 'accession', + 'accession_version', 'strain', 'date', 'region', diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 3c32e42..ca6cedb 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -66,8 +66,8 @@ rule format_ncbi_dataset_report: --elide-header \ | csvtk fix-quotes -Ht \ | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ - | csvtk rename -t -f accession -n accession-rev \ - | csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \ + | csvtk rename -t -f accession -n accession_version \ + | csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \ | csvtk del-quotes -t \ | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} @@ -89,7 +89,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession-rev \ + --seq-id-column accession_version \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \ diff --git a/phylogenetic/rules/merge_sequences_usvi.smk b/phylogenetic/rules/merge_sequences_usvi.smk index ffc7a50..3765937 100644 --- a/phylogenetic/rules/merge_sequences_usvi.smk +++ b/phylogenetic/rules/merge_sequences_usvi.smk @@ -21,35 +21,11 @@ This part of the workflow usually includes the following steps: """ -rule add_metadata_columns: - """Add columns to metadata - - Notable columns: - - genbank_accession: GenBank accession for Auspice to generate a URL to the NCBI GenBank record. - - [NEW] accession: The GenBank accession. 
Added to go alongside USVI accession. - - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). Added to go alongside USVI url. - """ - input: - metadata = "data/metadata.tsv" - output: - metadata = "data/metadata_modified.tsv" - shell: - """ - csvtk mutate2 -tl \ - -n url \ - -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \ - {input.metadata} \ - | csvtk mutate2 -tl \ - -n accession \ - -e '$genbank_accession' \ - > {output.metadata} - """ - rule append_usvi: """Appending USVI sequences""" input: sequences = "data/sequences.fasta", - metadata = "data/metadata_modified.tsv", + metadata = "data/metadata.tsv", usvi_sequences = "data/sequences_usvi.fasta", usvi_metadata = "data/metadata_usvi.tsv" output: