galaxyproject · bgruening · Apr 13, 2024 · Mar 21, 2024 · Mar 21, 2024 · Mar 21, 2024
diff --git a/tools/chewbbaca/.shed.yml b/tools/chewbbaca/.shed.yml
@@ -0,0 +1,15 @@
+name: chewbbaca  # Tool Shed repository name
+owner: iuc
+description: BSR-Based Allele Calling Algorithm
+long_description: chewBBACA is a comprehensive pipeline including a set of functions for the creation and validation of whole genome and core genome MultiLocus Sequence Typing (wg/cgMLST) schemas, providing an allele calling algorithm based on Blast Score Ratio that can be run in multiprocessor settings and a set of functions to visualize and validate allele variation in the loci. chewBBACA performs the schema creation and allele calls on complete or draft genomes.
+homepage_url: https://github.com/B-UMMI/chewBBACA/tree/master
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/chewbbaca
+categories:
+  - Variant Analysis
+auto_tool_repositories:  # templates applied to each tool in this directory
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper for {{ tool_name }}."
+suite:  # suite definition grouping the tool repositories
+  name: suite_chewbbaca
+  description: "A suite of Galaxy tools designed to work with the chewbbaca-tools collection."
+  type: repository_suite_definition
diff --git a/tools/chewbbaca/AlleleCall.xml b/tools/chewbbaca/AlleleCall.xml
@@ -0,0 +1,172 @@
+<tool id="AlleleCall" name="chewBBACA AlleleCall" version="@CHEW_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5" profile="@PROFILE@">
+    <description>Determine the allelic profiles of a set of genomes</description>
+    <!-- macros.xml is imported for the macros expanded in this tool; the version and
+         profile tokens used on the tool tag are presumably defined there as well
+         (macros.xml is not visible from this file - confirm). -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+        ## chewBBACA is given directories (-i / -g), so stage the datasets first.
+        mkdir ./input &&
+        mkdir ./schema &&
+        ## Link each assembly under its element identifier; quoting protects
+        ## paths and identifiers that contain spaces or shell metacharacters.
+        #for $file in $input_file
+            ln -sf '$file' './input/${file.element_identifier}' &&
+        #end for
+        unzip '$input_schema' -d ./schema &&
+        chewBBACA.py AlleleCall
+            ## Everything below is declared inside <section name="advanced">,
+            ## so it has to be addressed through the section name.
+            #if str($advanced.training_file) != 'None'
+                --ptf '$advanced.training_file'
+            #end if
+            --bsr $advanced.blast_score_ratio
+            --l $advanced.minimum_length
+            --t $advanced.translation_table
+            --st $advanced.size_threshold
+            $advanced.no_inferred
+            --pm $advanced.prodigal_mode
+            --mode $advanced.mode
+            --force-continue
+            -i ./input -g ./schema/schema_seed/ -o ./output
+    ]]></command>
+    <inputs>
+        <param format="fasta" name="input_file" type="data" multiple="true" label="Genome assemblies in FASTA format"/>
+        <!-- Single dataset only: the command unzips one archive and reads ./schema/schema_seed/ from it. -->
+        <param format="zip" name="input_schema" type="data" label="Schema files in zip format" help="The schema directory contains the loci FASTA files and a folder named 'short' that contains the FASTA files with the loci representative alleles."/>
+        <section name="advanced" title="Advanced options">
+            <param argument="--training-file" type="data" format="binary" label="Prodigal training file" optional="true"/>
+            <param argument="--blast-score-ratio" type="float" min="0.0" max="1.0" value="0.6" label="BLAST Score Ratio value"/>
+            <param argument="--minimum-length" type="integer" min="0" value="201" label="Minimum sequence length value"/>
+            <param argument="--translation-table" type="integer" min="0" value="11" label="Genetic code used to predict genes and to translate coding sequences"/>
+            <param argument="--size-threshold" type="float" min="0" value="0.2" label="CDS size variation threshold"/>
+            <!-- Checking this box passes the flag, i.e. inferred alleles are NOT written to the schema. -->
+            <param argument="--no-inferred" type="boolean" truevalue="--no-inferred" falsevalue="" checked="false" label="Do not add the sequences of inferred alleles (INF) to the schema" help="Allelic profiles will still include the allele identifiers attributed to the inferred alleles. Use this parameter if the schema is being accessed by multiple processes/users simultaneously."/>
+            <param argument="--prodigal-mode" type="select" label="Prodigal Mode" help="single for finished genomes, reasonable quality draft genomes and big viruses. meta for metagenomes, low quality draft genomes, small viruses, and small plasmids">
+                <option value="single" selected="true">single</option>
+                <option value="meta">meta</option>
+            </param>
+            <param argument="--mode" type="select" label="Execution mode">
+                <option value="1">Only exact matches at DNA level</option>
+                <option value="2">Exact matches at DNA and Protein level</option>
+                <option value="3">Exact matches and minimizer-based clustering to find similar alleles based on BSR+0.1</option>
+                <option value="4" selected="true">Full process: exact matches and similar matches based on BSR value, including the determination of new representative alleles to add to the schema</option>
+            </param>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Files in ./output matching *.tsv are collected as elements of this list. -->
+        <collection name="allelecall_results" type="list" label="${tool.name} on ${on_string}: AlleleCall Results">
+            <discover_datasets directory="./output" pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular"/>
+        </collection>
+        <!-- Files in ./output matching *.txt are collected as elements of this list. -->
+        <collection name="allelcall_log" type="list" label="${tool.name} on ${on_string}: AlleleCall Logs">
+            <discover_datasets directory="./output" pattern="(?P&lt;name&gt;.+)\.txt$" format="txt"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Default-parameter run on one assembly against the bundled schema archive. -->
+        <test>
+            <param name="input_file" value="GCA_000007265.1_ASM726v1_genomic.fna"/>
+            <param name="input_schema" value="GCA_000007265.1_ASM726v1_schema_seed.zip"/>
+            <output_collection name="allelecall_results" type="list">
+                <element name="loci_summary_stats" file="loci_summary_stats.tsv" compare="sim_size"/>
+                <element name="paralogous_loci" ftype="tabular">
+                    <assert_contents>
+                        <has_text_matching expression="Genome.*Loci.*CDS"/>
+                    </assert_contents>
+                </element>
+                <!-- results_alleles was declared twice; one element now carries both the
+                     size comparison and the content assertions. -->
+                <element name="results_alleles" file="results_alleles.tsv" compare="sim_size" ftype="tabular">
+                    <assert_contents>
+                        <has_text_matching expression="1.*1.*NIPHEM.*1.*1"/>
+                        <has_text_matching expression="GCA_000007265.*1"/>
+                    </assert_contents>
+                </element>
+                <element name="results_statistics" file="results_statistics.tsv" compare="sim_size"/>
+            </output_collection>
+            <output_collection name="allelcall_log" type="list">
+                <element name="logging_info" ftype="txt">
+                    <assert_contents>
+                        <has_text_matching expression="Used a BSR of: 0.6"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+chewBBACA version: 3.3.3
+Authors: Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez
+Github: https://github.com/B-UMMI/chewBBACA
+Documentation: https://chewbbaca.readthedocs.io/en/latest/index.html
+Contacts: imm-bioinformatics@medicina.ulisboa.pt
+
+==========================
+  chewBBACA - AlleleCall
+==========================
+Performs allele calling to determine the allelic profiles of a set of samples in FASTA format. The
+process identifies new alleles, assigns an integer identifier to those alleles and adds them to the
+schema.
+
+  -i, --input-files [INPUT_FILES]               Path to the directory with the genome FASTA files or
+                                                to a file that contains a list of full paths to the
+                                                FASTA files, one per line. (default: None)
+
+  -g, --schema-directory SCHEMA_DIRECTORY       Path to the schema directory. The schema directory
+                                                contains the loci FASTA files and a folder named
+                                                "short" that contains the FASTA files with the loci
+                                                representative alleles. (default: None)
+
+  -o, --output-directory OUTPUT_DIRECTORY       Output directory where the allele calling results
+                                                will be stored (will create a subdirectory named
+                                                "results_<TIMESTAMP>" if the path passed by the user
+                                                already exists). (default: None)
+
+  --ptf, --training-file PTF_PATH               Path to the Prodigal training file. Default is to
+                                                get the training file from the schema's directory
+                                                (default: None)
+
+  --bsr, --blast-score-ratio BLAST_SCORE_RATIO  BLAST Score Ratio value. Sequences with alignments
+                                                with a BSR value equal to or greater than this value
+                                                will be considered as sequences from the same gene.
+                                                (default: None)
+
+  --l, --minimum-length MINIMUM_LENGTH          Minimum sequence length accepted for a coding
+                                                sequence to be included in the schema. (default:
+                                                None)
+
+  --t, --translation-table TRANSLATION_TABLE    Genetic code used to predict genes and to translate
+                                                coding sequences. Must match the genetic code used
+                                                to create the training file. (default: None)
+
+  --st, --size-threshold SIZE_THRESHOLD         CDS size variation threshold. At the default value
+                                                of 0.2, alleles with size that deviates +-20 percent
+                                                from the locus length mode will be classified as
+                                                ASM/ALM (default: None)
+
+  --pm, --prodigal-mode {single,meta}           Prodigal running mode ("single" for finished
+                                                genomes, reasonable quality draft genomes and big
+                                                viruses. "meta" for metagenomes, low quality draft
+                                                genomes, small viruses, and small plasmids).
+                                                (default: single)
+
+  --no-inferred                                 If provided, the process will not add the sequences
+                                                of inferred alleles (INF) to the schema. Allelic
+                                                profiles will still include the allele identifiers
+                                                attributed to the inferred alleles. Use this
+                                                parameter if the schema is being accessed by
+                                                multiple processes/users simultaneously. (default:
+                                                False)
+
+  --mode {1,2,3,4}                              Execution mode (1: only exact matches at DNA level;
+                                                2: exact matches at DNA and Protein level; 3: exact
+                                                matches and minimizer-based clustering to find
+                                                similar alleles based on BSR+0.1; 4: runs the full
+                                                process to find exact matches and similar matches
+                                                based on BSR value, including the determination of
+                                                new representative alleles to add to the schema).
+                                                (default: 4)
+
+
+It is strongly advised to perform allele calling with the default schema parameters to ensure more
+consistent results. Module documentation available at
+https://chewbbaca.readthedocs.io/en/latest/user/modules/AlleleCall.html
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
diff --git a/tools/chewbbaca/CreateSchema.xml b/tools/chewbbaca/CreateSchema.xml
@@ -0,0 +1,119 @@
+<tool id="CreateSchema"
+      name="chewBBACA CreateSchema"
+      version="@CHEW_VERSION@+galaxy@VERSION_SUFFIX@"
+      profile="@PROFILE@">
+    <description>Create a gene-by-gene schema</description>
+    <!-- macros.xml is imported for the macros expanded in this tool. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## chewBBACA is given a directory (-i), so link every assembly into ./input
+        ## under its element identifier. Quoting protects unusual file names.
+        mkdir ./input &&
+        #for $file in $input_file
+            ln -sf '$file' './input/${file.element_identifier}' &&
+        #end for
+        chewBBACA.py CreateSchema
+            ## Everything below is declared inside <section name="advanced">,
+            ## so it has to be addressed through the section name.
+            #if str($advanced.training_file) != 'None'
+                --ptf '$advanced.training_file'
+            #end if
+            --bsr $advanced.blast_score_ratio
+            --l $advanced.minimum_length
+            --t $advanced.translation_table
+            --st $advanced.size_threshold
+            --pm $advanced.prodigal_mode
+            -i ./input -o ./output &&
+        ## Pack the schema directory so it can be returned as one dataset.
+        cd output/ &&
+        zip -r schema_seed.zip ./schema_seed
+    ]]></command>
+    <inputs>
+        <param format="fasta" name="input_file" type="data" multiple="true" label="Genome assemblies in FASTA format"/>
+        <section name="advanced" title="Advanced options">
+            <param argument="--training-file" type="data" format="binary" label="Prodigal training file" optional="true"/>
+            <!-- The long option is spelled with hyphens only (see the help text below). -->
+            <param argument="--blast-score-ratio" type="float" min="0.0" max="1.0" value="0.6" label="BLAST Score Ratio value"/>
+            <param argument="--minimum-length" type="integer" min="0" value="201" label="Minimum sequence length value"/>
+            <param argument="--translation-table" type="integer" min="0" value="11" label="Genetic code used to predict genes and to translate coding sequences"/>
+            <param argument="--size-threshold" type="float" min="0" value="0.2" label="CDS size variation threshold"/>
+            <param argument="--prodigal-mode" type="select" label="Prodigal Mode" help="single for finished genomes, reasonable quality draft genomes and big viruses. meta for metagenomes, low quality draft genomes, small viruses, and small plasmids">
+                <option value="single" selected="true">single</option>
+                <option value="meta">meta</option>
+            </param>
+        </section>
+        <!-- These switches only control which optional datasets are published (see the output filters). -->
+        <section name="output" title="Output options">
+            <param name="show_cds_invalid" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output invalid CDS file?"/>
+            <param name="show_cds_coord" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output CDS coordinates File?"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Zipped schema directory produced at the end of the command. -->
+        <data format="zip" name="schema" from_work_dir="output/schema_seed.zip" label="${tool.name} on ${on_string}: Schema files"/>
+        <!-- Optional outputs, published only when the matching switch in the "output" section is set. -->
+        <data format="txt" name="txt_file" from_work_dir="output/invalid_cds.txt" label="${tool.name} on ${on_string}: invalid CDS">
+            <filter>output['show_cds_invalid']</filter>
+        </data>
+        <!-- Published as "tabular", matching how AlleleCall exposes its .tsv files. -->
+        <data format="tabular" name="tst_file" from_work_dir="output/cds_coordinates.tsv" label="${tool.name} on ${on_string}: CDS coordinates">
+            <filter>output['show_cds_coord']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Default run: both optional-output switches are off, so one output is expected. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="GCA_000007265.1_ASM726v1_genomic.fna"/>
+            <output name="schema" file="GCA_000007265.1_ASM726v1_schema_seed.zip" compare="sim_size"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+chewBBACA version: 3.3.3
+Authors: Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez
+Github: https://github.com/B-UMMI/chewBBACA
+Documentation: https://chewbbaca.readthedocs.io/en/latest/index.html
+Contacts: imm-bioinformatics@medicina.ulisboa.pt
+
+============================
+chewBBACA - CreateSchema
+============================
+usage: 
+Create schema from genome assemblies.
+
+positional arguments:
+CreateSchema
+
+options:
+
+-i, --input-files [INPUT_FILES]               Path to the directory that contains the input FASTA
+                                                files. Alternatively, a file with a list of full
+                                                paths to FASTA files, one per line. (default: None)
+
+-o, --output-directory OUTPUT_DIRECTORY       Output directory where the process will store
+                                                intermediate files and create the schema's
+                                                directory. (default: None)
+
+--ptf, --training-file PTF_PATH               Path to the Prodigal training file. (default: None)
+
+--bsr, --blast-score-ratio BLAST_SCORE_RATIO  BLAST Score Ratio value. Sequences with alignments
+                                                with a BSR value equal to or greater than this value
+                                                will be considered as sequences from the same gene.
+                                                (default: 0.6)
+
+--l, --minimum-length MINIMUM_LENGTH          Minimum sequence length value. Coding sequences
+                                                shorter than this value are excluded. (default: 201)
+
+--t, --translation-table TRANSLATION_TABLE    Genetic code used to predict genes and to translate
+                                                coding sequences. (default: 11)
+
+--st, --size-threshold SIZE_THRESHOLD         CDS size variation threshold. Added to the schema's
+                                                config file and used to identify alleles with a
+                                                length value that deviates from the locus length
+                                                mode during the allele calling process. (default:
+                                                0.2)
+
+--pm, --prodigal-mode {single,meta}           Prodigal running mode ("single" for finished
+                                                genomes, reasonable quality draft genomes and big
+                                                viruses. "meta" for metagenomes, low quality draft
+                                                genomes, small viruses, and small plasmids).
+                                                (default: single)
+
+
+It is strongly advised to provide a training file to create a schema. Module documentation available
+at https://chewbbaca.readthedocs.io/en/latest/user/modules/CreateSchema.html
+        ]]>
+    </help>
+    <expand macro="citations" />
+</tool>