diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml
new file mode 100644
index 00000000..da95a1ef
--- /dev/null
+++ b/.github/workflows/run-nextclade.yaml
@@ -0,0 +1,32 @@
+name: Run Nextclade on all sequences
+
+on:
+  workflow_dispatch:
+    inputs:
+      dockerImage:
+        description: "Specific container image to use for build (will override the default of `nextstrain build`)"
+        required: false
+        type: string
+
+jobs:
+  run-build:
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      runtime: aws-batch
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.dockerImage }}
+      run: |
+        nextstrain build \
+          --detach \
+          --cpus 36 \
+          --memory 72gib \
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          . \
+            upload_all_nextclade_files \
+            -p \
+            --configfile profiles/nextclade.yaml \
+            --set-threads run_nextclade=12
diff --git a/Snakefile b/Snakefile
index 67f975eb..ce810f49 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,6 +4,7 @@ from treetime.utils import numeric_date
 
 
 wildcard_constraints:
+    lineage = r'h1n1pdm|h3n2|vic|yam',
     segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
     center = r'who|cdc|crick|niid|crick|vidrl',
     passage = r'cell|egg',
diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml
new file mode 100644
index 00000000..44346bc3
--- /dev/null
+++ b/profiles/nextclade.yaml
@@ -0,0 +1,17 @@
+custom_rules:
+  - workflow/snakemake_rules/download_from_s3.smk
+  - profiles/nextclade/run-nextclade.smk
+
+s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu"
+
+segments:
+  - ha
+  - na
+
+builds:
+  h1n1pdm:
+    lineage: h1n1pdm
+  h3n2:
+    lineage: h3n2
+  vic:
+    lineage: vic
diff --git a/profiles/nextclade/run-nextclade.smk b/profiles/nextclade/run-nextclade.smk
new file mode 100644
index 00000000..d4a8c41e
--- /dev/null
+++ b/profiles/nextclade/run-nextclade.smk
@@ -0,0 +1,81 @@
+# Run Nextclade for every configured lineage and segment, then upload the
+# alignments and annotation tables below the `s3_dst` location from the config.
+
+# Target rule: requests one upload flag per file type, build lineage, and segment.
+rule upload_all_nextclade_files:
+    input:
+        files=lambda wildcards: [
+            "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=build["lineage"], segment=segment)
+            for filetype in ("alignment", "nextclade")
+            for build in config["builds"].values()
+            for segment in config["segments"]
+        ]
+
+# Fetch the Nextclade dataset named `flu_<lineage>_<segment>` into a local directory.
+rule get_nextclade_dataset_for_lineage_and_segment:
+    output:
+        nextclade_dir=directory("nextclade_dataset/{lineage}_{segment}/"),
+    shell:
+        """
+        nextclade3 dataset get \
+            -n flu_{wildcards.lineage}_{wildcards.segment} \
+            --output-dir {output.nextclade_dir}
+        """
+
+# Run Nextclade on all sequences of one lineage/segment, writing an alignment and a TSV.
+# Tool output stays visible on the console and is also saved to the declared log file.
+rule run_nextclade:
+    input:
+        nextclade_dir="nextclade_dataset/{lineage}_{segment}/",
+        sequences="data/{lineage}/{segment}.fasta",
+    output:
+        alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta",
+        annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv",
+    log:
+        "logs/run_nextclade_{lineage}_{segment}.txt"
+    threads: 8
+    shell:
+        """
+        nextclade3 run \
+            -j {threads} \
+            -D {input.nextclade_dir} \
+            --output-fasta {output.alignment} \
+            --output-tsv {output.annotations} \
+            {input.sequences} 2>&1 | tee {log}
+        """
+
+# Upload the alignment; the uploader's output goes to the log and to the `.done` flag file.
+rule upload_alignment:
+    input:
+        alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta",
+    output:
+        flag="data/upload/s3/alignment_{lineage}_{segment}.done",
+    params:
+        s3_dst=config["s3_dst"],
+    log:
+        "logs/upload_alignment_{lineage}_{segment}.txt"
+    shell:
+        """
+        ./scripts/upload-to-s3 \
+            --quiet \
+            {input.alignment:q} \
+            {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/aligned.fasta.xz 2>&1 | tee {log:q} {output.flag}
+        """
+
+# Upload the Nextclade annotations; same log and flag handling as upload_alignment.
+rule upload_nextclade_annotations:
+    input:
+        annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv",
+    output:
+        flag="data/upload/s3/nextclade_{lineage}_{segment}.done",
+    params:
+        s3_dst=config["s3_dst"],
+    log:
+        "logs/upload_nextclade_annotations_{lineage}_{segment}.txt"
+    shell:
+        """
+        ./scripts/upload-to-s3 \
+            --quiet \
+            {input.annotations:q} \
+            {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/nextclade.tsv.xz 2>&1 | tee {log:q} {output.flag}
+        """
diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 9087ddc8..e82a78dd 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -488,10 +488,10 @@ rule annotate_recency_of_submissions:
     output:
         node_data = "builds/{build_name}/recency.json",
     params:
-        submission_date_field=config["submission_date_field"],
-        date_bins=config["recency"]["date_bins"],
-        date_bin_labels=config["recency"]["date_bin_labels"],
-        upper_bin_label=config["recency"]["upper_bin_label"],
+        submission_date_field=config.get("submission_date_field"),
+        date_bins=config.get("recency", {}).get("date_bins"),
+        date_bin_labels=config.get("recency", {}).get("date_bin_labels"),
+        upper_bin_label=config.get("recency", {}).get("upper_bin_label"),
     conda: "../envs/nextstrain.yaml"
     benchmark:
         "benchmarks/recency_{build_name}.txt"
diff --git a/workflow/snakemake_rules/download_from_s3.smk b/workflow/snakemake_rules/download_from_s3.smk
index 05733145..19d06145 100644
--- a/workflow/snakemake_rules/download_from_s3.smk
+++ b/workflow/snakemake_rules/download_from_s3.smk
@@ -1,3 +1,6 @@
+ruleorder: download_parsed_sequences > parse
+ruleorder: download_parsed_metadata > annotate_metadata_with_reference_strains
+
 rule download_sequences:
     output:
         sequences="data/{lineage}/raw_{segment}.fasta"
@@ -19,3 +22,27 @@ rule download_titers:
         """
         aws s3 cp {params.s3_path} - | gzip -c -d > {output.titers}
         """
+
+# Stream `sequences.fasta.xz` for one lineage/segment from S3 and decompress it.
+rule download_parsed_sequences:
+    output:
+        sequences="data/{lineage}/{segment}.fasta"
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/sequences.fasta.xz"
+    conda: "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences}
+        """
+
+# Stream `metadata.tsv.xz` for one lineage from S3 and decompress it.
+rule download_parsed_metadata:
+    output:
+        metadata="data/{lineage}/metadata.tsv",
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/metadata.tsv.xz"
+    conda: "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.metadata}
+        """
diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk
index 3898f067..84b96d3b 100644
--- a/workflow/snakemake_rules/export.smk
+++ b/workflow/snakemake_rules/export.smk
@@ -59,7 +59,7 @@ rule export:
         metadata = build_dir + "/{build_name}/metadata.tsv",
         node_data = _get_node_data_by_wildcards,
         auspice_config = lambda w: config['builds'][w.build_name]['auspice_config'],
-        lat_longs = config['lat-longs']
+        lat_longs = config.get('lat-longs', "config/lat_longs.tsv"),
     output:
         auspice_json = "auspice/{build_name}_{segment}.json",
         root_sequence_json = "auspice/{build_name}_{segment}_root-sequence.json",