Merge pull request #6163 from rlibouba/add_evidencemodeler

add evidencemodeler
galaxyproject · Sep 15, 2024 · 982fe89 · 982fe89
2 parents 3d8e0f1 + 873f4d3
commit 982fe89
Show file tree

Hide file tree

Showing 10 changed files with 4,542 additions and 0 deletions.
diff --git a/tools/evidencemodeler/.shed.yml b/tools/evidencemodeler/.shed.yml
@@ -0,0 +1,9 @@
+# Tool Shed metadata for the EVidenceModeler Galaxy wrapper.
+categories: [Genome annotation]
+description: EVidenceModeler (EVM) combines ab initio gene predictions with protein and transcript alignments into weighted consensus gene structures.
+homepage_url: https://github.com/EVidenceModeler/EVidenceModeler
+long_description: |
+   EVM provides a flexible and intuitive framework for combining various types of evidence into a single
+   automated gene structure annotation system.
+owner: iuc
+name: evidencemodeler
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/evidencemodeler
diff --git a/tools/evidencemodeler/evidencemodeler.xml b/tools/evidencemodeler/evidencemodeler.xml
@@ -0,0 +1,132 @@
+<tool id="evidencemodeler" name="EVidenceModeler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <!--
+        Galaxy wrapper around the EVidenceModeler command line.
+        Inputs are symlinked into the job directory under fixed names, EVM is run with
+        sample id "galaxy", and the resulting galaxy.EVM.gff3 / galaxy.EVM.pep files are
+        collected as the two outputs.
+    -->
+    <description>combines ab initio gene predictions, protein and transcript alignments into gene structures</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">EvidenceModeler</xref>
+    </xrefs>
+    <expand macro="requirements"/>
+
+    <command detect_errors="exit_code"><![CDATA[
+
+    ## Stage the inputs under stable file names. Optional datasets are only
+    ## linked when supplied, so no dangling symlink to 'None' is created.
+    ln -s '$input_genome' ./input_genome.fasta &&
+    ln -s '$input_predictions' ./input_predictions.gff &&
+    ln -s '$input_weights' ./input_weights.txt &&
+    #if $input_proteins:
+        ln -s '$input_proteins' ./input_proteins.gff &&
+    #end if
+    #if $input_transcript:
+        ln -s '$input_transcript' ./input_transcript.gff &&
+    #end if
+
+    EVidenceModeler
+    --sample_id galaxy
+    --genome './input_genome.fasta'
+    --gene_predictions './input_predictions.gff'
+    --weights './input_weights.txt'
+    --segmentSize $segmentsize
+    --overlapSize $overlapsize
+
+    #if $input_proteins:
+        --protein_alignments './input_proteins.gff'
+    #end if
+
+    #if $input_transcript:
+        --transcript_alignments './input_transcript.gff'
+    #end if
+
+    #if $opt.input_repeat:
+        --repeats '$opt.input_repeat'
+    #end if
+
+    #if $opt.input_terminalexon:
+        --terminalExons '$opt.input_terminalexon'
+    #end if
+
+    ## The stop codon select is optional: when nothing is selected, omit the
+    ## flag instead of passing the literal string 'None' to EVM.
+    #if $opt.stop_codon:
+        --stop_codons '$opt.stop_codon'
+    #end if
+
+    --min_intron_length $opt.min_intron_length
+    --search_long_introns $opt.search_long_introns
+    --re_search_intergenic $opt.re_search_intergenic
+    --terminal_intergenic_re_search $opt.terminal_intergenic_re_search
+
+    ]]></command>
+
+    <inputs>
+        <param name="input_genome" type="data" format="fasta" label="Genome input"/>
+        <param name="input_predictions" type="data" format="gff3" label="Gene predictions input"/>
+        <!-- The weights file is a tab-delimited text table, not GFF3; "tabular" still accepts GFF3-typed datasets selected previously. -->
+        <param name="input_weights" type="data" format="tabular" label="Weights for evidence types file" help="See documentation for formatting: 'Weights' section"/>
+        <param name="input_proteins" type="data" optional="true" format="gff3" label="Protein alignments input" help="Optional but recommended"/>
+        <param name="input_transcript" type="data" optional="true" format="gff3" label="Transcript alignments input" help="Optional but recommended"/>
+        <param argument="--segmentSize" name="segmentsize" value="100000" min="1" type="integer" label="Length of a single sequence" help="This value must be less than 1 MB" />
+        <param argument="--overlapSize" name="overlapsize" value="10000" min="0" type="integer" label="Length of sequence overlap between segmented sequences" help="The length must be at least equivalent to one or two expected gene lengths" />
+        <section name="opt" title="Advanced option" expanded="false">
+            <param name="input_repeat" type="data" optional="true" format="gff3" label="Masked genome repeats"/>
+            <param name="input_terminalexon" type="data" optional="true" format="gff3" label="Additional file of terminal exons to be taken into account" help="From long-orfs PASA"/>
+            <param name="stop_codon" argument="--stop_codons" type="select" multiple="true" optional="true" label="List of stop codon" help="For Tetrahymena, set TGA">
+                <option value="TAA,TGA,TAG" selected="true">TAA,TGA,TAG</option>
+                <option value="TAA">TAA</option>
+                <option value="TGA">TGA</option>
+                <option value="TAG">TAG</option>
+            </param>
+            <param argument="--min_intron_length" type="integer" value="20" min="0" label="Minimum length for an intron" help="Default 20 bp" />
+            <param argument="--search_long_introns" type="select" label="Reexamine long introns" help="Can find nested genes, but also can result in false positives">
+                <option value="0" selected="true">Off</option>
+                <option value="1">On</option>
+            </param>
+            <param argument="--re_search_intergenic" type="select" label="Reexamines intergenic regions of minimum length">
+                <option value="0" selected="true">Off</option>
+                <option value="1">On</option>
+            </param>
+            <param argument="--terminal_intergenic_re_search" type="select" label="Reexamines genomic regions outside of the span of all predicted genes">
+                <option value="0" selected="true">Off</option>
+                <option value="1">On</option>
+            </param>
+        </section>
+    </inputs>
+
+    <outputs>
+        <!-- EVM writes GFF3, so the datatype is declared as gff3 (a subtype of gff in Galaxy). -->
+        <data name='evm_gff' format='gff3' label="${tool.name} on ${on_string}: GFF3" from_work_dir="galaxy.EVM.gff3"/>
+        <data name='evm_pep' format='fasta' label="${tool.name} on ${on_string}: PEP" from_work_dir="galaxy.EVM.pep"/>
+    </outputs>
+
+    <tests>
+        <!-- Full run with every evidence type and the advanced options left at their defaults. -->
+        <test expect_num_outputs="2">
+            <param name="input_genome" value="genome.fasta"/>
+            <param name="input_predictions" value="gene_predictions.gff3"/>
+            <param name="input_weights" value="weights.txt"/>
+            <param name="input_proteins" value="protein_alignments.gff3"/>
+            <param name="input_transcript" value="transcript_alignments.gff3"/>
+            <param name="segmentsize" value="100000"/>
+            <param name="overlapsize" value="10000"/>
+            <!-- "opt" is declared as a section in the inputs, so it is addressed as a section here. -->
+            <section name="opt">
+                <param name="min_intron_length" value="20"/>
+                <param name="search_long_introns" value="0"/>
+                <param name="re_search_intergenic" value="0"/>
+                <param name="terminal_intergenic_re_search" value="0"/>
+            </section>
+            <output name="evm_pep" ftype="fasta">
+                <assert_contents>
+                    <has_text text="evm.model.Contig1.3 evm.TU.Contig1.3  EVM prediction Contig1.3 Contig1:7611-9749(-)"/>
+                    <has_text text="evm.model.Contig1.10 evm.TU.Contig1.10  EVM prediction Contig1.10 Contig1:57371-59941(+)"/>
+                    <has_n_lines n="108" delta="0"/>
+                    <has_n_columns n="1" delta="0"/>
+                </assert_contents>
+            </output>
+            <output name="evm_gff" ftype="gff3">
+                <assert_contents>
+                    <has_text text="ID=evm.TU.Contig1.1;Name=EVM%20prediction%20Contig1.1"/>
+                    <has_text text="ID=evm.TU.Contig1.4;Name=EVM%20prediction%20Contig1.4"/>
+                    <has_n_lines n="191" delta="0"/>
+                    <has_n_columns n="9" delta="0"/>
+                    <!-- the sep=";" is used to count the gff properties -->
+                    <has_n_columns n="2" delta="0" sep=";"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+        EvidenceModeler_: EVidenceModeler (aka EVM) is a software package that combines ab initio
+        gene predictions and protein and transcript alignments into weighted consensus gene structures.
+        EVM provides a flexible and intuitive framework for combining various types of evidence into a
+        single automated gene structure annotation system.
+
+        **Inputs**
+
+        - Genome sequence (FASTA), gene predictions (GFF3) and a weights file are required.
+        - Protein alignments and transcript alignments (GFF3) are optional but recommended.
+        - Masked repeats and additional terminal exons (GFF3) can be supplied in the advanced options.
+
+        **Outputs**
+
+        - The consensus gene structures in GFF3 format.
+        - The corresponding protein sequences in FASTA format.
+
+        .. _EvidenceModeler: https://github.com/EVidenceModeler/EVidenceModeler.github.io
+    ]]></help>
+    <expand macro="citation"/>
+</tool>
diff --git a/tools/evidencemodeler/macros.xml b/tools/evidencemodeler/macros.xml
@@ -0,0 +1,18 @@
+<macros>
+    <!-- Upstream EVidenceModeler release wrapped by the tool; also pins the package requirement. -->
+    <token name="@TOOL_VERSION@">2.1.0</token>
+    <!-- Wrapper revision appended after "+galaxy" in the tool version string. -->
+    <token name="@VERSION_SUFFIX@">0</token>
+
+    <!-- Package dependency of the tool, pinned to @TOOL_VERSION@. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">evidencemodeler</requirement>
+        </requirements>
+    </xml>
+
+    <!-- DOIs cited by the tool. -->
+    <xml name="citation">
+        <citations>
+            <citation type="doi">10.1186/gb-2008-9-1-r7</citation>
+            <citation type="doi">10.1080/21501203.2011.606851</citation>
+        </citations>
+    </xml>
+</macros>