From 2631fe828935f39380017fa36d2ba1b78a0c907d Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 1 Nov 2024 13:26:46 -0700
Subject: [PATCH] WIP: use pathoplexus for potential 1A strains

---
 ingest/rules/nextclade.smk | 87 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index d765daf..661378a 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -13,10 +13,65 @@ like to customize the rules:
 https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 """
 
+rule pathoplexus_classify:
+    """
+    Pulls global lineage calls from the Pathoplexus API.
+
+    Requests the `insdcAccessionBase` and `lineage` fields as TSV and renames
+    `insdcAccessionBase` to the curated ID field. curl runs with --fail so an
+    HTTP error aborts the rule instead of saving the error body as the TSV.
+    NOTE(review): `uniq` only drops adjacent duplicate rows.
+    """
+    output:
+        pathoplexus_tsv="data/pathoplexus_results/global_lineages.tsv",
+    params:
+        URL="https://lapis.pathoplexus.org/west-nile/sample/details",
+        fields="insdcAccessionBase,lineage",
+        id_field=config["curate"]["output_id_field"],
+    shell:
+        """
+        curl -fsSL "{params.URL}?dataFormat=TSV&downloadAsFile=false&fields={params.fields}" \
+        | uniq \
+        | csvtk -t rename -f "insdcAccessionBase" -n {params.id_field:q} \
+        > {output.pathoplexus_tsv}
+        """
+
+rule select_USA_potential_samples:
+    """
+    Select potential 1A sequences based on the Pathoplexus lineage calls.
+
+    Drops every record whose lineage matches `1B` or contains a digit 2-8,
+    keeping 1A and "unassigned" records, then subsets the sequences to the
+    remaining IDs.
+    NOTE(review): no country filter is applied in this rule despite its name;
+    confirm that the input sequences are restricted to the USA elsewhere.
+    """
+    input:
+        sequences="results/sequences.fasta",
+        pathoplexus_tsv="data/pathoplexus_results/global_lineages.tsv",
+    output:
+        potential_1A_samples="data/pathoplexus_results/potential_1A_samples.tsv",
+        sequences="data/potential_1A_sequences.fasta",
+    params:
+        id_field=config["curate"]["output_id_field"],
+    shell:
+        """
+        tsv-filter -H \
+            --not-regex 'lineage:1B|[2-8]' \
+            {input.pathoplexus_tsv} \
+        > {output.potential_1A_samples}
+
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {output.potential_1A_samples} \
+            --metadata-id-column {params.id_field:q} \
+            --output-sequences {output.sequences}
+        """
+
 rule nextclade_classify:
     #Classifies sequences into clades using Nextclade
     input:
-        sequences="results/sequences.fasta",
+        sequences="data/potential_1A_sequences.fasta",
         dataset=config["nextclade"]["nextclade_dataset_path"],
     output:
         nextclade_tsv="data/nextclade_results/nextclade.tsv",
@@ -55,7 +110,7 @@ rule append_nextclade_columns:
         metadata="data/raw_metadata.tsv",
         nextclade_subtypes="data/nextclade_clades.tsv",
     output:
-        metadata_all="results/metadata.tsv",
+        metadata_all="data/metadata_nextclade.tsv",
     params:
         id_field=config["curate"]["output_id_field"],
         nextclade_field=config["nextclade"]["nextclade_field"],
@@ -69,3 +124,31 @@ rule append_nextclade_columns:
             {input.metadata} \
         > {output.metadata_all}
         """
+
+rule append_pathoplexus_columns:
+    """
+    Append the Pathoplexus lineage calls to the Nextclade-annotated metadata.
+
+    Both tables are keyed on the curated ID field: the Pathoplexus accession
+    column is renamed to that field when the TSV is downloaded.
+    """
+    input:
+        metadata="data/metadata_nextclade.tsv",
+        pathoplexus_tsv="data/pathoplexus_results/global_lineages.tsv",
+    output:
+        metadata="results/metadata.tsv",
+    params:
+        id_field=config["curate"]["output_id_field"],
+        pathoplexus_field=config["curate"]["output_id_field"],
+    shell:
+        r"""
+        augur merge \
+            --metadata \
+                metadata={input.metadata:q} \
+                pathoplexus={input.pathoplexus_tsv:q} \
+            --metadata-id-columns \
+                metadata={params.id_field:q} \
+                pathoplexus={params.pathoplexus_field:q} \
+            --output-metadata {output.metadata:q} \
+            --no-source-columns
+        """