diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 4a51e7e..919b698 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -94,6 +94,8 @@ curate:
   output_id_field: 'genbank_accession'
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: 'sequence'
+  # The metadata TSV column that contains the GenBank accession; used to build the 'url' column
+  genbank_accession: 'genbank_accession'
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     'genbank_accession',
@@ -111,4 +113,5 @@ curate:
     'sra_accessions',
     'authors',
     'institution',
+    'url',
   ]
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 2ec4ab9..d890038 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -102,9 +102,35 @@ rule curate:
         """
 
 
+rule add_metadata_columns:
+    """Add columns to metadata
+
+    Notable columns:
+    - [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
+
+    The accession column is the one named by config['curate']['genbank_accession'].
+    """
+    input:
+        metadata="data/all_metadata.tsv",
+    output:
+        metadata=temp("data/all_metadata_added.tsv"),
+    params:
+        accession=config['curate']['genbank_accession'],
+    # Doubled braces are literal braces to Snakemake, so csvtk receives ${<column>},
+    # the braced column reference that also accepts names with special characters.
+    shell:
+        """
+        csvtk mutate2 -t \
+            -n url \
+            -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${{{params.accession}}}' \
+            {input.metadata} \
+        > {output.metadata}
+        """
+
+
 rule subset_metadata:
     input:
-        metadata="data/all_metadata.tsv",
+        metadata="data/all_metadata_added.tsv",
     output:
         subset_metadata="results/metadata.tsv",
     params: