Skip to content

Commit

Permalink
Ingest: Derive URL column during ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Dec 16, 2024
1 parent e304e66 commit 803ed16
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
3 changes: 3 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ curate:
output_id_field: 'genbank_accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
# The metadata column that contains the GenBank accession; used to derive the 'url' column
genbank_accession: 'genbank_accession'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
Expand All @@ -111,4 +113,5 @@ curate:
'sra_accessions',
'authors',
'institution',
'url',
]
23 changes: 22 additions & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,30 @@ rule curate:
"""


rule add_metadata_columns:
    """Add derived columns to the metadata TSV.

    Notable columns:
    - url: URL linking to the NCBI GenBank record
      ('https://www.ncbi.nlm.nih.gov/nuccore/<accession>'), built by
      appending the value of the accession column to the nuccore base URL.
    """
    input:
        metadata="data/all_metadata.tsv",
    output:
        metadata=temp("data/all_metadata_added.tsv"),
    params:
        # Name of the metadata column holding the GenBank accession.
        # In the shell command, `${{{params.accession}}}` is rendered by
        # Snakemake as `${<column name>}`: the doubled braces are literal
        # braces and the inner pair is the Snakemake placeholder. The result
        # is a csvtk column reference (not a shell variable; it sits inside
        # single quotes). The braced form also works for column names that
        # contain characters other than letters, digits and underscores.
        accession=config['curate']['genbank_accession'],
    shell:
        """
        csvtk mutate2 -t \
            -n url \
            -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${{{params.accession}}}' \
            {input.metadata} \
            > {output.metadata}
        """


rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
metadata="data/all_metadata_added.tsv",
output:
subset_metadata="results/metadata.tsv",
params:
Expand Down

0 comments on commit 803ed16

Please sign in to comment.