-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
3,624 additions
and
2 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,48 @@ | ||
# Broccoli | ||
orthology assignment using phylogenetic and network analyses | ||
|
||
|
||
<p align="center"> | ||
<img width="300" height="auto" src="./images/logo_broccoli.png"> | ||
</p> | ||
|
||
## Overview | ||
|
||
Broccoli, a user-friendly pipeline designed to infer with high precision orthologous groups and pairs of proteins using a phylogeny-based approach. Briefly, Broccoli performs ultra-fast phylogenetic analyses on most proteins and builds a network of orthologous relationships. Orthologous groups are then identified from the network using a parameter-free machine learning algorithm (label propagation). Broccoli is also able to detect chimeric proteins resulting from gene-fusion events and to assign these proteins to the corresponding orthologous groups. | ||
|
||
__Reference:__ <a href="">insert reference</a> | ||
|
||
<p align="center"> | ||
<img width="650" height="auto" src="./images/overview_broccoli.png"> | ||
</p> | ||
|
||
|
||
## Requirements | ||
To run Broccoli, you need (see the [**manual**](manual_Broccoli.pdf) for installation advices): | ||
- a Unix system (MacOS or Linux) | ||
- Python version 3.6 or above | ||
- <a href="https://github.com/etetoolkit/ete">ete3 library</a> | ||
- <a href="https://github.com/bbuchfink/diamond">Diamond</a> version 0.9.25 or above | ||
- <a href="http://www.microbesonline.org/fasttree/">FastTree2</a> | ||
|
||
|
||
## Running Broccoli | ||
All parameters and options are available using the `-help` argument (see also the [**manual**](manual_Broccoli.pdf) for more details): | ||
``` | ||
python broccoli.py -help | ||
``` | ||
To test Broccoli with the small example dataset present in the directory `example_dataset` (30 sec to 1mn): | ||
``` | ||
python broccoli.py -dir example_dataset | ||
``` | ||
Broccoli will store the temporary and output files in 4 directories named `dir_step1` to `dir_step4` (one for each step) located in the current directory. | ||
In this test run, Broccoli should identify 227 orthologous groups, 1 chimeric protein and 863 orthologous pairs. | ||
|
||
|
||
|
||
## Licence | ||
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. | ||
|
||
See "LICENSE" for full terms and conditions of usage. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
''' | ||
This file is part of Broccoli. | ||
Broccoli is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
Broccoli is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with Broccoli. If not, see <https://www.gnu.org/licenses/>. | ||
contact: [email protected] | ||
''' | ||
|
||
|
||
import argparse | ||
import os | ||
import sys | ||
import shutil | ||
from scripts import broccoli_step1 | ||
from scripts import broccoli_step2 | ||
from scripts import broccoli_step3 | ||
from scripts import broccoli_step4 | ||
|
||
|
||
######################### functions | ||
|
||
def parse_args(): | ||
# define and parse command-line arguments | ||
parser = argparse.ArgumentParser(description=' Broccoli v1.0', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n') | ||
|
||
common = parser.add_argument_group(' general options') | ||
common.add_argument('-steps', help='steps to be performed, comma separated (default = \'1,2,3,4\')', metavar='', type=str, default='1,2,3,4') | ||
common.add_argument('-threads', help='number of threads [default = 1]', metavar='', type=int, default=1) | ||
common.add_argument('-h','-help', action="help", help="show this help message and exit") | ||
|
||
step1 = parser.add_argument_group(' STEP 1 kmer clustering') | ||
step1.add_argument('-dir', help='name of the directory containing the proteome files [required]', metavar='') | ||
step1.add_argument('-ext', help='extension of proteome files (default = \'.fasta\')', metavar='', type=str, default='.fasta') | ||
step1.add_argument('-min_length', help='minimum length of sequences [default = 10]', metavar='', type=int, default=10) | ||
step1.add_argument('-kmer_size', help='length of kmers [default = 100]', metavar='', type=int, default=100) | ||
step1.add_argument('-kmer_min_aa', help='minimum nb of different aa a kmer should have [default = 15]', metavar='', type=int, default=15) | ||
|
||
step2 = parser.add_argument_group(' STEP 2 phylomes') | ||
step2.add_argument('-path_diamond', help='path of DIAMOND with filename [default = \'diamond\']', metavar='', type=str, default='diamond') | ||
step2.add_argument('-path_fasttree', help='path of FastTree with filename [default = \'fasttree\']', metavar='', type=str, default='fasttree') | ||
step2.add_argument('-e_value', help='e-value for similarity search [default = 0.001]', metavar='', type=float, default=0.001) | ||
step2.add_argument('-nb_hits', help='maximum nb of hits per species [default = 6]', metavar='', type=int, default=6) | ||
step2.add_argument('-max_gap', help='maximum fraction of gap per position [default = 0.7]', metavar='', type=float, default=0.7) | ||
|
||
step3 = parser.add_argument_group(' STEP 3 network analysis') | ||
step3.add_argument('-min_nb_hits', help='minimum number of hits belonging to the OG [default = 2]', metavar='', type=int, default=2) | ||
step3.add_argument('-fusion_shared', help='minimum fraction of connected nodes in each OG [default = 0.5]', metavar='', type=float, default=0.5) | ||
step3.add_argument('-fusion_nb_sp', help='minimum nb of species in OGs involved in gene-fusions [default = 2]', metavar='', type=int, default=2) | ||
step3.add_argument('-fusion_overlap', help='maximum overlap between OGs in amino-acids [default = 10]', metavar='', type=int, default=10) | ||
|
||
step4 = parser.add_argument_group(' STEP 4 orthologous pairs') | ||
step4.add_argument('-ratio_ortho', help='limit ratio ortho/total [default = 0.5]', metavar='', type=float, default=0.5) | ||
step4.add_argument('-not_same_sp', help='ignore ortho relationships between proteins of the same species (QfO benchmark)', action="store_true") | ||
|
||
args = parser.parse_args() | ||
|
||
# clean directory name | ||
if args.dir: | ||
args.dir = clean_dir_name(args.dir) | ||
|
||
return args.steps, args.threads, \ | ||
args.dir, args.ext, args.min_length, args.kmer_size, args.kmer_min_aa, \ | ||
args.e_value, args.nb_hits, args.path_diamond, args.path_fasttree, args.max_gap, \ | ||
args.min_nb_hits, args.fusion_shared, args.fusion_overlap, args.fusion_nb_sp, \ | ||
args.ratio_ortho, args.not_same_sp | ||
|
||
|
||
|
||
def clean_dir_name(d): | ||
d = d.replace('./','') | ||
d = d.replace('/','') | ||
d = './' + d + '/' | ||
return d.replace('//','/') | ||
|
||
|
||
def check_python_version(): | ||
if sys.version_info[0] != 3 and sys.version_info[1] < 6: | ||
sys.exit('\n ERROR: your python is version '+ str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + ', please use version 3.6+\n\n') | ||
|
||
|
||
def parse_steps(p): | ||
# split the steps and check them | ||
l = p.split(',') | ||
try: | ||
s = {int(x) for x in l} | ||
except: | ||
sys.exit('\n ERROR: the list of steps should be composed of integers (-steps option)\n\n') | ||
# check for consecutiveness | ||
range_steps = max(s) - min(s) | ||
if range_steps != (len(s) - 1): | ||
sys.exit('\n ERROR: the steps should be consecutive (-steps option)\n\n') | ||
return s | ||
|
||
|
||
def pre_checking_pgms(p_diamond, p_fasttree): | ||
# check diamond | ||
if '/' in p_diamond: | ||
if not os.path.isfile(p_diamond): | ||
sys.exit("\n ERROR: the path to DIAMOND is incorrect\n\n") | ||
elif not shutil.which(p_diamond): | ||
sys.exit("\n ERROR: the path to DIAMOND is incorrect\n\n") | ||
|
||
# check FastTree | ||
if '/' in p_fasttree: | ||
if not os.path.isfile(p_fasttree): | ||
sys.exit("\n ERROR: the path to FastTree is incorrect\n\n") | ||
elif not shutil.which(p_fasttree): | ||
sys.exit("\n ERROR: the path to FastTree is incorrect\n\n") | ||
|
||
|
||
######################### main part | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
## get all arguments | ||
pre_steps, nb_threads, \ | ||
directory, extension, min_seq, length_kmer, min_aa, \ | ||
evalue, max_per_species, path_diamond, path_fasttree, trim_thres, \ | ||
min_nb_hits, limit_shared, limit_overlap, limit_nb_sp, \ | ||
limit_ortho, not_same_sp = parse_args() | ||
|
||
|
||
print('\n Broccoli v1.0\n') | ||
|
||
|
||
## check python version | ||
check_python_version() | ||
|
||
## parse steps | ||
steps = parse_steps(pre_steps) | ||
|
||
## check if -dir option (cases of 1st step) | ||
if 1 in steps and directory is None: | ||
sys.exit('\n ERROR: you need to specify an input directory (see -help)\n\n') | ||
|
||
## check path of executables if step 2 (diamond, fasttree) | ||
if 2 in steps: | ||
pre_checking_pgms(path_diamond, path_fasttree) | ||
|
||
## execute the steps | ||
if 1 in steps: | ||
broccoli_step1.step1_kmer_clustering(directory, extension, min_seq, length_kmer, min_aa, nb_threads) | ||
|
||
if 2 in steps: | ||
broccoli_step2.step2_phylomes(evalue, max_per_species, path_diamond, path_fasttree, trim_thres, nb_threads) | ||
|
||
if 3 in steps: | ||
broccoli_step3.step3_orthology_network(min_nb_hits, limit_shared, limit_overlap, limit_nb_sp, nb_threads) | ||
|
||
if 4 in steps: | ||
broccoli_step4.step4_orthologous_pairs(limit_ortho, not_same_sp, nb_threads) | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.