files v1.0

rderelle · Nov 26, 2019 · 20b6aa5 · 20b6aa5
1 parent 2e5d36a
commit 20b6aa5
Show file tree

Hide file tree

Showing 16 changed files with 3,624 additions and 2 deletions.
diff --git a/Manual_broccoli_v1.0.pdf b/Manual_broccoli_v1.0.pdf
diff --git a/README.md b/README.md
@@ -1,2 +1,48 @@
-# Broccoli
-orthology assignment using phylogenetic and network analyses
+
+
+<p align="center">
+  <img width="300" height="auto" src="./images/logo_broccoli.png">
+</p>
+
+## Overview
+
+Broccoli, a user-friendly pipeline designed to infer with high precision orthologous groups and pairs of proteins using a phylogeny-based approach. Briefly, Broccoli performs ultra-fast phylogenetic analyses on most proteins and builds a network of orthologous relationships. Orthologous groups are then identified from the network using a parameter-free machine learning algorithm (label propagation). Broccoli is also able to detect chimeric proteins resulting from gene-fusion events and to assign these proteins to the corresponding orthologous groups. 
+
+__Reference:__ <a href="">insert reference</a>
+
+<p align="center">
+  <img width="650" height="auto" src="./images/overview_broccoli.png">
+</p>
+
+
+## Requirements
+To run Broccoli, you need (see the [**manual**](manual_Broccoli.pdf) for installation advices):
+- a Unix system (MacOS or Linux)
+- Python version 3.6 or above
+- <a href="https://github.com/etetoolkit/ete">ete3 library</a>
+- <a href="https://github.com/bbuchfink/diamond">Diamond</a> version 0.9.25 or above
+- <a href="http://www.microbesonline.org/fasttree/">FastTree2</a>
+
+
+## Running Broccoli
+All parameters and options are available using the `-help` argument (see also the [**manual**](manual_Broccoli.pdf) for more details):
+```
+python broccoli.py -help
+```
+To test Broccoli with the small example dataset present in the directory `example_dataset` (30 sec to 1mn):
+```
+python broccoli.py -dir example_dataset
+```
+Broccoli will store the temporary and output files in 4 directories named `dir_step1` to `dir_step4` (one for each step) located in the current directory.
+In this test run, Broccoli should identify 227 orthologous groups, 1 chimeric protein and 863 orthologous pairs.
+
+
+
+## Licence
+This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
+
+See "LICENSE" for full terms and conditions of usage.
diff --git a/broccoli.py b/broccoli.py
@@ -0,0 +1,171 @@
+'''
+    This file is part of Broccoli.
+
+    Broccoli is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Broccoli is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Broccoli.  If not, see <https://www.gnu.org/licenses/>.
+    
+    contact: [email protected]
+
+'''
+
+
+import argparse
+import os
+import sys
+import shutil
+from scripts import broccoli_step1
+from scripts import broccoli_step2
+from scripts import broccoli_step3
+from scripts import broccoli_step4
+
+
+######################### functions
+
+def parse_args():
+    # define and parse command-line arguments
+    parser = argparse.ArgumentParser(description='            Broccoli v1.0', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n')
+
+    common = parser.add_argument_group(' general options')
+    common.add_argument('-steps',         help='steps to be performed, comma separated (default = \'1,2,3,4\')', metavar='', type=str, default='1,2,3,4')    
+    common.add_argument('-threads',       help='number of threads [default = 1]', metavar='', type=int, default=1)
+    common.add_argument('-h','-help',     action="help", help="show this help message and exit")
+
+    step1 = parser.add_argument_group(' STEP 1  kmer clustering')
+    step1.add_argument('-dir',            help='name of the directory containing the proteome files [required]', metavar='')
+    step1.add_argument('-ext',            help='extension of proteome files (default = \'.fasta\')', metavar='', type=str, default='.fasta')    
+    step1.add_argument('-min_length',     help='minimum length of sequences [default = 10]', metavar='', type=int, default=10)
+    step1.add_argument('-kmer_size',      help='length of kmers [default = 100]', metavar='', type=int, default=100)
+    step1.add_argument('-kmer_min_aa',    help='minimum nb of different aa a kmer should have [default = 15]', metavar='', type=int, default=15)
+
+    step2 = parser.add_argument_group(' STEP 2  phylomes')
+    step2.add_argument('-path_diamond',   help='path of DIAMOND with filename [default = \'diamond\']', metavar='', type=str, default='diamond')    
+    step2.add_argument('-path_fasttree',  help='path of FastTree with filename [default = \'fasttree\']', metavar='', type=str, default='fasttree')    
+    step2.add_argument('-e_value',        help='e-value for similarity search [default = 0.001]', metavar='', type=float, default=0.001)
+    step2.add_argument('-nb_hits',        help='maximum nb of hits per species [default = 6]', metavar='', type=int, default=6)
+    step2.add_argument('-max_gap',        help='maximum fraction of gap per position [default = 0.7]', metavar='', type=float, default=0.7)
+
+    step3 = parser.add_argument_group(' STEP 3  network analysis')
+    step3.add_argument('-min_nb_hits',    help='minimum number of hits belonging to the OG [default = 2]', metavar='', type=int, default=2)
+    step3.add_argument('-fusion_shared',  help='minimum fraction of connected nodes in each OG [default = 0.5]', metavar='', type=float, default=0.5)
+    step3.add_argument('-fusion_nb_sp',   help='minimum nb of species in OGs involved in gene-fusions [default = 2]', metavar='', type=int, default=2)
+    step3.add_argument('-fusion_overlap', help='maximum overlap between OGs in amino-acids [default = 10]', metavar='', type=int, default=10)
+
+    step4 = parser.add_argument_group(' STEP 4  orthologous pairs')
+    step4.add_argument('-ratio_ortho',    help='limit ratio ortho/total [default = 0.5]', metavar='', type=float, default=0.5)
+    step4.add_argument('-not_same_sp',    help='ignore ortho relationships between proteins of the same species (QfO benchmark)', action="store_true")
+
+    args = parser.parse_args()
+
+    # clean directory name
+    if args.dir:
+        args.dir = clean_dir_name(args.dir)
+
+    return args.steps, args.threads, \
+    args.dir, args.ext, args.min_length, args.kmer_size, args.kmer_min_aa, \
+    args.e_value, args.nb_hits, args.path_diamond, args.path_fasttree, args.max_gap, \
+    args.min_nb_hits, args.fusion_shared, args.fusion_overlap, args.fusion_nb_sp, \
+    args.ratio_ortho, args.not_same_sp
+
+
+
+def clean_dir_name(d):
+    d = d.replace('./','')
+    d = d.replace('/','')
+    d = './' + d + '/'
+    return d.replace('//','/')
+
+
+def check_python_version():
+    if sys.version_info[0] != 3 and sys.version_info[1] < 6:
+        sys.exit('\n            ERROR: your python is version '+ str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + ', please use version 3.6+\n\n')
+
+
+def parse_steps(p):
+    # split the steps and check them
+    l = p.split(',')
+    try:
+        s = {int(x) for x in l}
+    except:
+        sys.exit('\n            ERROR: the list of steps should be composed of integers (-steps option)\n\n')
+    # check for consecutiveness
+    range_steps = max(s) - min(s)
+    if range_steps != (len(s) - 1):
+        sys.exit('\n            ERROR: the steps should be consecutive (-steps option)\n\n')
+    return s
+
+
+def pre_checking_pgms(p_diamond, p_fasttree):
+    # check diamond
+    if '/' in p_diamond:
+        if not os.path.isfile(p_diamond):
+            sys.exit("\n            ERROR: the path to DIAMOND is incorrect\n\n")
+    elif not shutil.which(p_diamond):
+        sys.exit("\n            ERROR: the path to DIAMOND is incorrect\n\n")
+
+    # check FastTree
+    if '/' in p_fasttree:
+        if not os.path.isfile(p_fasttree):
+            sys.exit("\n            ERROR: the path to FastTree is incorrect\n\n")
+    elif not shutil.which(p_fasttree):
+        sys.exit("\n            ERROR: the path to FastTree is incorrect\n\n")
+
+
+######################### main part
+
+
+if __name__ == "__main__":
+
+    ## get all arguments
+    pre_steps, nb_threads, \
+    directory, extension, min_seq, length_kmer, min_aa, \
+    evalue, max_per_species, path_diamond, path_fasttree, trim_thres, \
+    min_nb_hits, limit_shared, limit_overlap, limit_nb_sp, \
+    limit_ortho, not_same_sp = parse_args()
+
+
+    print('\n            Broccoli v1.0\n')
+
+
+    ## check python version
+    check_python_version()
+
+    ## parse steps
+    steps = parse_steps(pre_steps)
+
+    ## check if -dir option (cases of 1st step)
+    if 1 in steps and directory is None:
+        sys.exit('\n            ERROR: you need to specify an input directory (see -help)\n\n')
+
+    ## check path of executables if step 2 (diamond, fasttree)
+    if 2 in steps:
+        pre_checking_pgms(path_diamond, path_fasttree)
+
+    ## execute the steps
+    if 1 in steps:
+        broccoli_step1.step1_kmer_clustering(directory, extension, min_seq, length_kmer, min_aa, nb_threads)
+
+    if 2 in steps:
+        broccoli_step2.step2_phylomes(evalue, max_per_species, path_diamond, path_fasttree, trim_thres, nb_threads)
+
+    if 3 in steps:
+        broccoli_step3.step3_orthology_network(min_nb_hits, limit_shared, limit_overlap, limit_nb_sp, nb_threads)
+
+    if 4 in steps:
+        broccoli_step4.step4_orthologous_pairs(limit_ortho, not_same_sp, nb_threads)
+
+
+
+
+
+
+