galaxyproject · bernt-matthias · Jun 16, 2023 · Dec 19, 2022 · Jan 20, 2023 · Mar 10, 2023
diff --git a/tools/proteinortho/proteinortho.xml b/tools/proteinortho/proteinortho.xml
@@ -44,21 +44,20 @@
         proteinortho 
             --project=result
             --cpus="\${GALAXY_SLOTS:-4}"
-            --ram="\${GALAXY_MEMORY_MB:-16000}"
             #if $more_options.selfblast:
                 $more_options.selfblast
             #end if
             #if $more_options.singles:
                 $more_options.singles
             #end if
             --p=$p
-            --e=$evalue
+            --e=$more_options.evalue
             --conn=$conn
             #if $more_options.cov:
                 --cov=$more_options.cov
             #end if
-            #if $more_options.sim:
-                --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$more_options.sim/100}"`
+            #if $sim:
+                --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$sim/100}"`
             #end if
             #if $more_options.identity:
-                --cov=$more_options.identity
+                --identity=$more_options.identity
@@ -100,11 +99,11 @@
             <option value="blatp">BLAT (aminoacid sequences)</option>
             <option value="blatn">BLAT (nucleotide sequences)</option>
         </param>
-        <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="This is the main parameter for the generation of the reciprocal best hit graph. Larger values results in more false positives (connections between proteins)."/>
-        <param argument="--conn" type="float" value="0.1" min="0." max="10." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters."/>
+        <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal reciprocal similarity in %" help="This and --evalue are main parameters for the generation of the reciprocal best hit graph. 100 = only the best reciprocal hits are reported, 0 = all possible reciprocal blast matches (within the E-value cutoff) are reported."/>
+        <param argument="--conn" type="float" value="0.1" min="0." max="1." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. Choose larger values then more splits are done, resulting in more and smaller clusters. A value of 0 corresponds to no clustering."/>
         <section name="more_options" title="Additional Options" expanded="False">
+            <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values results in more false positives (connections between proteins)."/>
             <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
-            <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal sequence similarity in %"/>
             <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
             <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
             <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
@@ -124,42 +123,73 @@
             <when value="specified">
                 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/>
                 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/>
-                <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Minimal percent identity of best blast hits"/>
+                <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity" help="alpha[FF-adj score] + (1−alpha)[BLAST score]"/>
                 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accoringly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> 
             </when>
         </conditional>
     </inputs>
     <outputs>
-        <data name="blastgraph" format="tabular" label="${tool.name} on ${on_string}: RBH graph" from_work_dir="result.blast-graph"/>
-        <data name="proteinortho" format="tabular" label="${tool.name} on ${on_string}: orthology-groups" from_work_dir="result.proteinortho.tsv"/>
-        <data name="proteinorthograph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs" from_work_dir="result.proteinortho-graph"/>
+        <!-- every output records its column layout through the column_names metadata action -->
+        <data name="blastgraph"
+              format="tabular"
+              label="${tool.name} on ${on_string}: RBH graph"
+              from_work_dir="result.blast-graph">
+            <actions>
+                <action name="column_names" type="metadata"
+                    default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
+            </actions>
+        </data>
+        <data name="proteinortho"
+              format="tabular"
+              label="${tool.name} on ${on_string}: orthology-groups"
+              from_work_dir="result.proteinortho.tsv">
+            <actions>
+                <action name="column_names" type="metadata"
+                    default="species,genes,alg.-conn.,${','.join([ f.element_identifier for f in $input_files ])}"/>
+            </actions>
+        </data>
+        <data name="proteinorthograph"
+              format="tabular"
+              label="${tool.name} on ${on_string}: orthology-pairs"
+              from_work_dir="result.proteinortho-graph">
+            <actions>
+                <action name="column_names" type="metadata"
+                    default="seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba"/>
+            </actions>
+        </data>
     </outputs>
     <tests>
         <test expect_num_outputs="3"> <!-- test normal -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
+            <param name="input_files" value="L.fasta,C.fasta,M.fasta,E.fasta"/>
+            <param name="p" value="diamond"/>
             <expand macro="test_outputs"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- various parameter -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
-            <param name="evalue" value="1"/>
+            <param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
+            <param name="p" value="diamond"/>
             <param name="conn" value="1"/>
-            <param name="cov" value="42"/>
             <param name="sim" value="42"/>
-            <param name="identity" value="42"/>
-            <param name="selfblast" value="true"/>
-            <param name="singles" value="true"/>
+            <section name="more_options">
+                <param name="cov" value="42"/>
+                <param name="identity" value="42"/>
+                <param name="selfblast" value="true"/>
+                <param name="singles" value="true"/>
+            </section>
             <expand macro="test_outputs"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
             </assert_command>
         </test>
         <test expect_num_outputs="3"> <!-- synteny -->
-            <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/>
-            <param name="input_files_syn" value="L.gff,C.gff,C2.gff,E.gff,M.gff"/>
-            <param name="synteny_options" value="specified"/>
+            <param name="input_files" value="L.fasta,C.fasta,M.fasta,E.fasta"/>
+            <param name="p" value="diamond"/>
+            <conditional name="synteny">
+                <param name="synteny_options" value="specified"/>
+                <param name="input_files_syn" value="L.gff,C.gff,E.gff,M.gff"/>
+            </conditional>
             <expand macro="test_outputs"/>
             <assert_command>
                 <has_text text="--p=diamond"/>
@@ -279,5 +309,5 @@ Proteinortho is a tool to detect orthologous proteins/genes within different spe
 More information can be found on github https://gitlab.com/paulklemm_PHD/proteinortho
 ]]>
     </help>
-    <expand macro="citations"/>
+    <expand macro="citations" />
 </tool>
diff --git a/tools/proteinortho/proteinortho_macros.xml b/tools/proteinortho/proteinortho_macros.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <macros>
-   <token name="@TOOL_VERSION@">6.1.5</token>
-   <token name="@WRAPPER_VERSION@">0</token>
+   <token name="@TOOL_VERSION@">6.2.0</token>
+   <token name="@WRAPPER_VERSION@">1</token>
    <token name="@PROFILE@">20.09</token>
    <xml name="citations">
         <citations>
@@ -12,12 +12,10 @@
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">proteinortho</requirement>
-            <!-- blast, blat, and last are not in the biopython requirements
-                 diamond is, but latest version does not work: https://gitlab.com/paulklemm_PHD/proteinortho/-/issues/55 -->
-            <requirement type="package" version="2.0.15">diamond</requirement>
+            <requirement type="package" version="2.1.4">diamond</requirement>
             <requirement type="package" version="2.13.0">blast</requirement>
             <requirement type="package" version="377">ucsc-blat</requirement>
-            <requirement type="package" version="1418">last</requirement>
+            <requirement type="package" version="1422">last</requirement>
         </requirements>
     </xml>
     <xml name="version_command">

diff --git a/tools/proteinortho/proteinortho_summary.xml b/tools/proteinortho/proteinortho_summary.xml
@@ -7,9 +7,6 @@
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
         export TERM=dumb &&
-        ## TODOs:
-        ## - check if 2>&1 can be removed https://gitlab.com/paulklemm_PHD/proteinortho/-/merge_requests/9
-        ## - include output0.tsv as Galaxy output?
         proteinortho_summary.pl 
             $queryfile
             #if $queryfile2:
@@ -19,6 +16,7 @@
             | awk '/^$/ && !f{f=1;next}1' ## remove potentially present 1st empty line
             | awk 'BEGIN{i=0} /^$/{i+=1}{print > ("output" i ".tsv")}' ## split file at empty lines
         &&
+        mv output0.tsv '$distribution' &&
         mv output1.tsv '$adjacencyMat' &&
         mv output2.tsv '$average1paths' &&
         mv output3.tsv '$adjacencyMatSquared' &&
@@ -29,14 +27,20 @@
         <param name="queryfile2" type="data" format="tabular" optional="true" label="(optional) A second orthology-pairs / RBH file" help="If you provide a second file, then difference is calculated (GRAPH - second GRAPH)"/>
     </inputs>
     <outputs>
+        <data name="distribution" format="tabular" label="${tool.name} on ${on_string}: Protein-Group distribution"/>
         <data name="adjacencyMat" format="tabular" label="${tool.name} on ${on_string}: Adjacency Matrix"/>
         <data name="average1paths" format="tabular" label="${tool.name} on ${on_string}: Average number of Edges"/>
         <data name="adjacencyMatSquared" format="tabular" label="${tool.name} on ${on_string}: Matrix of 2-paths"/>
         <data name="average2paths" format="tabular" label="${tool.name} on ${on_string}: Average number of 2-paths"/>
     </outputs>
     <tests>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="5">
             <param name="queryfile" value="result.proteinortho-graph"/>
+            <output name="distribution">
+                <assert_contents>
+                    <has_text text="%"/>
+                </assert_contents>
+            </output>
             <output name="adjacencyMat">
                 <assert_contents>
                     <has_text text="18"/>
@@ -66,7 +70,7 @@
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="5">
             <param name="queryfile" value="result.proteinortho-graph"/>
             <param name="queryfile2" value="result.blast-graph"/>
             <output name="average2paths">
@@ -77,13 +81,24 @@
                 </assert_contents>
             </output>
         </test>
+        <test expect_num_outputs="5">
+            <param name="queryfile" value="result.blast-graph"/>
+            <output name="average2paths">
+                <assert_contents>
+                    <has_text text="115.2"/>
+                    <has_text text="TERM" negate="true"/>
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[proteinortho summary
 
 **What it does**
 
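-proteinortho_summary : Summaries the (orthology-pairs/RBH) file(s) to determine how well the species are connected to each other.
+proteinortho_summary : Summarizes the (orthology-pairs/RBH) file(s) to determine how well the species are connected to each other.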
 
+ * **Protein-Group distribution** (for orthology-pairs) : This report contains overall statistics about the output: (i) the number of groups that contain at least p% of the input species (with p ranging between 0 and 100), and (ii) the number of groups for each input species.
+
  * **Adjacency Matrix** : How well are the species connected to each other directly.
 
  * **Average number of Edges** : Averaged number of connections for each species.
@@ -92,7 +107,6 @@ proteinortho_summary : Summaries the (orthology-pairs/RBH) file(s) to determine
 
  * **Average number of 2-paths** : The average number of 2-paths for each species. If a species is not well connected to all the other species, it will result in a low average.
 
-
 If you supply a second orthology-pairs/RBH then the difference is calculated for all 4 outputs.
 
 E.g. given the RBH and the orthology-pairs of the same run : The outputs show how much the clustering removed from the initial reciprocal best hit graph.