diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml index 9ae8e1cf..807a9f38 100644 --- a/config/config_hmpxv1.yaml +++ b/config/config_hmpxv1.yaml @@ -2,14 +2,55 @@ colors: "config/colors_hmpxv1.tsv" auspice_config: "config/auspice_config_hmpxv1.json" build_name: "hmpxv1" -auspice_name: "monkeypox_hmpxv1" +auspice_name: "mpox_mpxv-IIb" ## filter min_date: 2017 min_length: 100000 -sequences_per_group: "--sequences-per-group 40" -group_by: "--group-by year month country" -filters: "--exclude-where outbreak!=hMPXV-1" + + +### Set 1: Non-B.1 sequences: use all +### Set 2: B.1 sequences: small sample across year/country, maybe month +filter: + non_b1: + group_by: "--group-by lineage year country" + sequences_per_group: "--sequences-per-group 50" + other_filters: "outbreak!=hMPXV-1 clade!=IIb" + exclude_lineages: + - B.1 + - B.1.1 + - B.1.2 + - B.1.3 + - B.1.4 + - B.1.5 + - B.1.6 + - B.1.7 + - B.1.8 + - B.1.9 + - B.1.10 + - B.1.11 + - B.1.12 + - B.1.13 + - B.1.14 + - B.1.15 + - B.1.16 + - B.1.17 + - B.1.18 + - B.1.19 + - B.1.20 + - C.1 + b1: + group_by: "--group-by country year" + sequences_per_group: "--subsample-max-sequences 100" + other_filters: "--exclude-where outbreak!=hMPXV-1 clade!=IIb" + +## align +max_indel: 10000 +seed_spacing: 1000 + +## treefix +fix_tree: true +treefix_root: "--root MK783032" ## refine timetree: true diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml index 09519bc2..ced85742 100644 --- a/config/config_hmpxv1_big.yaml +++ b/config/config_hmpxv1_big.yaml @@ -2,18 +2,37 @@ colors: "config/colors_hmpxv1.tsv" auspice_config: "config/auspice_config_hmpxv1_big.json" build_name: "hmpxv1_big" -auspice_name: "monkeypox_hmpxv1_big" +auspice_name: "mpox_mpxv-IIb-B.1" ## filter -min_date: 2017 -min_length: 100000 -sequences_per_group: "" -group_by: "" -filters: "--exclude-where outbreak!=hMPXV-1" +min_date: 2022 +min_length: 180000 +filter: + b1: + group_by: "--group-by year month country" + sequences_per_group: "--subsample-max-sequences 5000" + other_filters: "outbreak!=hMPXV-1 clade!=IIb" + exclude_lineages: + - A + - A.1 + - A.1.1 + - A.2 + - A.2.1 + - A.2.2 + - A.2.3 + - A.3 + +## align +max_indel: 10000 +seed_spacing: 1000 + +## treefix +fix_tree: true +treefix_root: "--root OP890401" ## refine timetree: true -root: "MK783032 MK783030" +root: "OP890401" clock_rate: 5.7e-5 clock_std_dev: 2e-5 diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml index 526d9577..5447e329 100644 --- a/config/config_mpxv.yaml +++ b/config/config_mpxv.yaml @@ -2,13 +2,44 @@ colors: "config/colors_mpxv.tsv" auspice_config: "config/auspice_config_mpxv.json" build_name: "mpxv" -auspice_name: "monkeypox_mpxv" +auspice_name: "mpox_mpxv" ## filter min_date: 1950 min_length: 100000 -group_by: "--group-by clade year country" -sequences_per_group: "--sequences-per-group 50" + +### Set 1: Non-B.1 sequences: use all +### Set 2: B.1 sequences: small sample across year/country, maybe month +filter: + non_b1: + group_by: "--group-by clade year country" + sequences_per_group: "--sequences-per-group 50" + exclude_lineages: + - B.1 + - B.1.1 + - B.1.2 + - B.1.3 + - B.1.4 + - B.1.5 + - B.1.6 + - B.1.7 + - B.1.8 + - B.1.9 + - B.1.10 + - B.1.11 + - B.1.12 + - B.1.13 + - B.1.14 + - B.1.15 + - B.1.16 + - B.1.17 + - B.1.18 + - B.1.19 + - B.1.20 + - C.1 + b1: + group_by: "--group-by country year" + sequences_per_group: "--subsample-max-sequences 100" ## refine timetree: false diff --git a/config/include_hmpxv1.txt b/config/include_hmpxv1.txt new file mode 100644 index 00000000..e69de29b diff --git a/config/include_hmpxv1_big.txt b/config/include_hmpxv1_big.txt new file mode 100644 index 00000000..896db0d5 --- /dev/null +++ b/config/include_hmpxv1_big.txt @@ -0,0 +1 @@ +OP890401 \ No newline at end of file diff --git a/config/include_mpxv.txt b/config/include_mpxv.txt new file mode 100644 index 00000000..e69de29b diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 51c68124..6d6817e4 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -48,53 +48,60 @@ rule exclude_bad: """ -rule include_A_strains: +rule filter: input: + sequences=rules.exclude_bad.output.sequences, metadata=rules.exclude_bad.output.metadata, output: - include_strains=build_dir + "/{build_name}/include_strains.txt", + strains=build_dir + "/{build_name}/{subset}_strains.txt", + log=build_dir + "/{build_name}/{subset}_filter.log", + params: + group_by=lambda w: config["filter"][w.subset]["group_by"], + sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"], + other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""), + exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}" + if "exclude_lineages" in config["filter"][w.subset] + else "", + strain_id=config["strain_id_field"], shell: """ - tsv-filter -H \ - --str-in-fld lineage:'A'\ - {input.metadata} \ - | tsv-select -f 1 \ - > {output.include_strains} + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-strains {output.strains} \ + {params.group_by} \ + {params.sequences_per_group} \ + {params.exclude} \ + {params.other_filters} \ + --output-log {output.log} """ -rule filter: - message: - """ - Filtering to - - {params.sequences_per_group} sequence(s) per {params.group_by!s} - """ +rule filter_merge: input: + strains=lambda w: [ + f"{build_dir}/{w.build_name}/{subset}_strains.txt" + for subset in config["filter"] + ], sequences=rules.exclude_bad.output.sequences, metadata=rules.exclude_bad.output.metadata, - include=rules.include_A_strains.output.include_strains, + include="config/include_{build_name}.txt", output: sequences=build_dir + "/{build_name}/filtered.fasta", metadata=build_dir + "/{build_name}/metadata.tsv", - log=build_dir + "/{build_name}/filter.log", params: - group_by=config.get("group_by", "--group-by clade lineage"), - sequences_per_group=config["sequences_per_group"], - other_filters=config.get("filters", ""), strain_id=config["strain_id_field"], shell: """ augur filter \ + --metadata-id-columns {params.strain_id} \ --sequences {input.sequences} \ --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --include {input.include} \ + --exclude-all \ + --include {input.strains} {input.include}\ --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - {params.group_by} \ - {params.sequences_per_group} \ - {params.other_filters} \ - --output-log {output.log} + --output-metadata {output.metadata} """ @@ -198,11 +205,14 @@ rule fix_tree: alignment=build_dir + "/{build_name}/masked.fasta", output: tree=build_dir + "/{build_name}/tree_fixed.nwk", + params: + root=lambda w: config.get("treefix_root", ""), shell: """ python3 scripts/fix_tree.py \ --alignment {input.alignment} \ --input-tree {input.tree} \ + {params.root} \ --output {output.tree} """