From 5ee42473d7bba44a0730af599ee88dbfefeff1d4 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 29 Nov 2024 09:02:42 -0800
Subject: [PATCH] wip: assemble nextclade dataset rules

---
 .../rules/assemble_nextclade_dataset.smk      | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 nextclade/rules/assemble_nextclade_dataset.smk

diff --git a/nextclade/rules/assemble_nextclade_dataset.smk b/nextclade/rules/assemble_nextclade_dataset.smk
new file mode 100644
index 0000000..930aa64
--- /dev/null
+++ b/nextclade/rules/assemble_nextclade_dataset.smk
@@ -0,0 +1,73 @@
+"""
+This part of the workflow organizes the files for a Nextclade dataset.
+
+REQUIRED INPUTS:
+
+    tree            = auspice/tree.json
+    reference_files = defaults/reference.fasta, defaults/reference.gff3
+    pathogen_json   = defaults/pathogen.json
+    doc_files       = defaults/README.md, defaults/CHANGELOG.md
+    example query   = defaults/example_sequences.fasta
+
+OUTPUTS:
+
+    dataset_zip = dataset.zip
+    test_output = results of testing the example query against the Nextclade dataset
+
+This part of the workflow usually includes the following steps:
+
+    - zipping the final Nextclade dataset
+    - running a test of the final Nextclade dataset
+
+See the Nextclade documentation for more information:
+
+    - https://github.com/nextstrain/nextclade_data/blob/master/docs/dataset-creation-guide.md
+    - https://github.com/nextstrain/nextclade_data/blob/master/docs/dataset-curation-guide.md
+
+"""
+
+rule assemble_dataset:
+    """Copy the dataset files into dataset/ under fixed names and zip them."""
+    input:
+        tree="auspice/tree.json",
+        reference="defaults/reference.fasta",
+        annotation="defaults/reference.gff3",
+        sequences="defaults/example_sequences.fasta",
+        pathogen="defaults/pathogen.json",
+        readme="defaults/README.md",
+        changelog="defaults/CHANGELOG.md",
+    output:
+        tree="dataset/tree.json",
+        reference="dataset/reference.fasta",
+        annotation="dataset/genome_annotation.gff3",
+        sequences="dataset/sequences.fasta",
+        pathogen="dataset/pathogen.json",
+        readme="dataset/README.md",
+        changelog="dataset/CHANGELOG.md",
+        dataset_zip="dataset.zip",
+    shell:
+        """
+        cp {input.tree} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
+        cp {input.sequences} {output.sequences}
+        cp {input.pathogen} {output.pathogen}
+        cp {input.readme} {output.readme}
+        cp {input.changelog} {output.changelog}
+        zip -rj {output.dataset_zip} dataset/*
+        """
+
+rule test:
+    """Run Nextclade on the example sequences against the zipped dataset."""
+    input:
+        dataset="dataset.zip",
+        sequences="defaults/example_sequences.fasta",
+    output:
+        output=directory("test_out"),
+    shell:
+        """
+        nextclade3 run \
+          --input-dataset {input.dataset} \
+          --output-all {output.output} \
+          {input.sequences}
+        """