From 2684e248edd33a735163a25bb3f695ba0791f968 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 8 Nov 2023 15:53:54 -0800 Subject: [PATCH] NCBI Dataset field name transformations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Originally the field map was created to keep mpox NDJSON backward compatible with field names used from NCBI Virus. However, this constraint is not applicable to dengue.¹ This commit organizes field renaming into two parts. 1. Rename the NCBI output columns to match the NCBI mnemonics² (see "ncbi_datasets_fields:" in `config/config.yaml`) 2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names³ (see "transform: field_map:" in `config/config.yaml`) ¹ https://github.com/nextstrain/dengue/pull/13#discussion_r1374892802 ² https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields ³ https://docs.nextstrain.org/projects/ncov/en/latest/reference/metadata-fields.html --- ingest/config/config.yaml | 56 +++++++++++++++---- ingest/source-data/ncbi-dataset-field-map.tsv | 17 ------ .../snakemake_rules/fetch_sequences.smk | 51 ++++------------- 3 files changed, 55 insertions(+), 69 deletions(-) delete mode 100644 ingest/source-data/ncbi-dataset-field-map.tsv diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index f2dc1abf..231a5bb2 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -2,22 +2,52 @@ sources: ['genbank'] # Pathogen NCBI Taxonomy ID ncbi_taxon_id: '12637' -# Renames the NCBI dataset headers -ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv' +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" 
field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - submitter-names + - submitter-affiliation # Params for the transform rule transform: - # Fields to rename. + # NCBI Fields to rename to Nextstrain field names. # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution'] + field_map: [ + 'accession=genbank_accession', + 'accession-rev=genbank_accession_rev', + 'isolate-lineage=strain', + 'sourcedb=database', # necessary for applying geo location rules + 'geo-region=region', + 'geo-location=location', + 'host-name=host', + 'isolate-collection-date=date', + 'release-date=release_date', + 'update-date=update_date', + 'sra-accs=sra_accessions', + 'submitter-names=authors', + 'submitter-affiliation=institution', + ] # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names strain_regex: '^.+$' # Back up strain name field if 'strain' doesn't match regex above - strain_backup_fields: ['accession'] + strain_backup_fields: ['genbank_accession'] # List of date fields to standardize - date_fields: ['date', 'date_released'] + date_fields: ['date', 'release_date', 'update_date'] # Expected date formats present in date fields # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -47,24 +77,26 @@ transform: # User annotations file annotations: 'source-data/annotations.tsv' # ID field used to merge annotations - annotations_id: 'accession' + annotations_id: 'genbank_accession' # Field to use as the 
sequence ID in the FASTA file - id_field: 'accession' + id_field: 'genbank_accession' # Field to use as the sequence in the FASTA file sequence_field: 'sequence' # Final output columns for the metadata TSV metadata_columns: [ - 'accession', - 'genbank_accession_rev', 'strain', + 'genbank_accession', + 'genbank_accession_rev', 'date', 'region', 'country', 'division', 'location', + 'length', 'host', - 'date_released', - 'sra_accession', + 'release_date', + 'update_date', + 'sra_accessions', 'abbr_authors', 'authors', 'institution' diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv deleted file mode 100644 index 00c9db88..00000000 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ /dev/null @@ -1,17 +0,0 @@ -key value -Accession genbank_accession_rev -Source database database -Isolate Lineage strain -Geographic Region region -Geographic Location location -Isolate Collection date collected -Release date released -Update date updated -Length length -Host Name host -Isolate Lineage source isolation_source -BioProjects bioproject_accession -BioSample accession biosample_accession -SRA Accessions sra_accession -Submitter Names authors -Submitter Affiliation submitting_organization diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 3f32f9b4..efd9fbb7 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -44,57 +44,26 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. The column names in the output TSV - are different from the mnemonics. 
- - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "bioprojects", - "biosample-acc", - "sra-accs", - "submitter-names", - "submitter-affiliation", - ] - return ",".join(fields) - - rule format_ncbi_dataset_report: - # Formats the headers to be the same as before we used NCBI Datasets - # The only fields we do not have equivalents for are "title" and "publications" + # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", - ncbi_field_map=config["ncbi_field_map"], output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics, + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ - | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ - | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \ - | tsv-select -H -f genbank_accession --rest last \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \ + | csvtk rename -t -f accession -n accession-rev \ + | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." 
\ + | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} """ @@ -105,6 +74,8 @@ rule format_ncbi_datasets_ndjson: ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", output: ndjson="data/genbank.ndjson", + params: + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), log: "logs/format_ncbi_datasets_ndjson.txt", benchmark: @@ -114,7 +85,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column genbank_accession_rev \ + --seq-id-column accession-rev \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \