diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index f2dc1abf..d1fc2896 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -10,14 +10,28 @@ transform:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database',  # necessary for applying geo location rules
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',
+    'submitter-names=authors',
+    'submitter-affiliation=institution',
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['genbank_accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_released']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -54,17 +68,19 @@ transform:
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
     'strain',
+    'genbank_accession',
+    'genbank_accession_rev',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_released',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
     'authors',
     'institution'
diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
index 00c9db88..57b4f8c5 100644
--- a/ingest/source-data/ncbi-dataset-field-map.tsv
+++ b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -1,17 +1,17 @@
+# Maps the NCBI output TSV column names back to the NCBI mnemonics.
+# This list should match the list in
+# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics
 key	value
-Accession	genbank_accession_rev
-Source database	database
-Isolate Lineage	strain
-Geographic Region	region
-Geographic Location	location
-Isolate Collection date	collected
-Release date	released
-Update date	updated
+Accession	accession-rev
+Source database	sourcedb
+Isolate Lineage	isolate-lineage
+Geographic Region	geo-region
+Geographic Location	geo-location
+Isolate Collection date	isolate-collection-date
+Release date	release-date
+Update date	update-date
 Length	length
-Host Name	host
-Isolate Lineage source	isolation_source
-BioProjects	bioproject_accession
-BioSample accession	biosample_accession
-SRA Accessions	sra_accession
-Submitter Names	authors
-Submitter Affiliation	submitting_organization
+Host Name	host-name
+SRA Accessions	sra-accs
+Submitter Names	submitter-names
+Submitter Affiliation	submitter-affiliation
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 3f32f9b4..8d271930 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
 
 
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
         ncbi_field_map=config["ncbi_field_map"],
@@ -93,8 +92,8 @@ rule format_ncbi_dataset_report:
             --package {input.dataset_package} \
             --fields {params.fields_to_include:q} \
             | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \