diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 3670473cf..000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -db/** filter=lfs diff=lfs merge=lfs -text diff --git a/docker/Dockerfile b/Dockerfile similarity index 100% rename from docker/Dockerfile rename to Dockerfile diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 48ce2a3bd..7be1bbfbd 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "dbcan" %} -{% set version = "4.1.0" %} +{% set version = "4.1.2" %} package: name: "{{ name|lower }}" @@ -9,7 +9,7 @@ source: # the sha256 sum is generated by doing # wget -0- [URL] | shasum -a 256 url: https://github.com/linnabrown/run_dbcan/releases/download/{{ version }}/dbcan-{{ version }}.tar.gz - sha256: cb0907eb10eb916bcf676c58f54e67a67dd4ed559152e5547bb44d071f063b8f + sha256: 3a675683379d1afc9f3444fc9894272f1485956df266a6ee4fc11a8f628e6d51 build: number: 0 @@ -22,6 +22,7 @@ build: - dbcan_utils = dbcan.utils.utils:main - dbcan_plot = dbcan.utils.plots:main - dbcan_asmfree = dbcan.utils.diamond_unassembly:main + - dbcan_build = dbcan.utils.dbcan_build:main run_exports: - {{ pin_subpackage(name, max_pin="x") }} @@ -41,6 +42,9 @@ requirements: - numpy >1.19 - biopython - pandas + - tqdm + - openpyxl + - matplotlib-base - session-info test: diff --git a/dbcan.yml b/dbcan.yml index 863d385bd..18264048f 100644 --- a/dbcan.yml +++ b/dbcan.yml @@ -28,4 +28,5 @@ dependencies: - matplotlib - openpyxl - biopython - - bbtools \ No newline at end of file + - bbtools + - seaborn \ No newline at end of file diff --git a/dbcan/cli/run_dbcan.py b/dbcan/cli/run_dbcan.py index 9e1ea3075..e45ca9e15 100755 --- a/dbcan/cli/run_dbcan.py +++ b/dbcan/cli/run_dbcan.py @@ -13,7 +13,7 @@ import time # Recent updated information: -# Jan/01/23: Add doc code +# Jan/01/23: Add doc code [Haidong Yi, Le Huang] # Oct/10/23: Recontructed the run_dbcan [Haidong Yi] # Sep/07/23: Replace hmmscan with hmmsearch. 
Update perl code [Le Huang, Yanbin Yin] # Dec/15/22: 1.adding function to convert cgc_standard.out to json format. 2. adding function cgc_[Jinfang Zheng] @@ -60,8 +60,21 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name): hmm_file = f"{dbDir}{db_name}.hmm" uniInput_file = f"{outPath}uniInput" - hmmer = Popen( - [ + # hmmer = Popen( + # [ + # "hmmsearch", + # "--domtblout", + # domtblout_file, + # "--cpu", + # str(hmm_cpu), + # "-o", + # "/dev/null", + # hmm_file, + # uniInput_file, + # ] + # ) + # hmmer.wait() + hmmer_list = [ "hmmsearch", "--domtblout", domtblout_file, @@ -72,8 +85,9 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name): hmm_file, uniInput_file, ] - ) - hmmer.wait() + cmd_str = " ".join(hmmer_list) + os.system(cmd_str) + parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov) with open(f"{outPath}{db_name}.out", "w") as f: f.write(parsed_hmm_output) @@ -108,85 +122,40 @@ def split_uniInput(uniInput, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hm - Time taken for execution is printed at the end of the function's run. 
""" ticks = time.time() - file = open(uniInput) - uniInput_file = file.readlines() - file.close() - signal_count = 0 - min_files = dbcan_thread - file_number = None - split_files = [] - fsize = int(os.path.getsize(uniInput) / float(1024 * 1024) * dbcan_offset) - - if fsize < 1: - fsize = 1 - - for line in uniInput_file: - if ">" in line: - signal_count += 1 - print(f"Count of proteins: {signal_count}") - - if signal_count >= min_files: - for i in range(fsize): - with open(f"{outPath}{i}.txt", "w") as f: - pass - split_files.append(f"{i}.txt") - for i in range(len(uniInput_file)): - if ">" in uniInput_file[i]: - file_number = i % fsize - with open(f"{outPath}{file_number}.txt", "a") as f: - f.write(uniInput_file[i]) - - ths = [] - for j in split_files: - ths.append( - Popen( - [ - "hmmsearch", - "--domtblout", - f"{outPath}d{j}", - "--cpu", - str(hmm_cpu), - "-o", - "/dev/null", - f"{dbDir}dbCAN_sub.hmm", - f"{outPath}{j}", - ] - ) - ) - for th in ths: - th.wait() - for m in split_files: - hmm_parser_output = hmmer_parser.run(f"{outPath}d{m}", eval_num=hmm_eval, coverage=hmm_cov) - with open(f"{outPath}temp_{m}", "w") as temp_hmmer_file: - temp_hmmer_file.write(hmm_parser_output) - os.remove(f"{outPath}d{m}") - os.remove(f"{outPath}{m}") - with open(f"{outPath}dtemp.out", "w"): pass - for n in split_files: - with open(f"{outPath}temp_{n}") as file_read: - files_lines = file_read.readlines() - os.remove(f"{outPath}temp_{n}") - with open(f"{outPath}dtemp.out", "a") as f: - f.writelines(files_lines) - else: - dbsub = Popen( - [ - "hmmsearch", - "--domtblout", - f"{outPath}d.txt", - "--cpu", - str(hmm_cpu), - "-o", - "/dev/null", - f"{dbDir}dbCAN_sub.hmm", - f"{outPath}uniInput", - ] - ) - dbsub.wait() + + # dbsub = Popen( + # [ + # "hmmsearch", + # "--domtblout", + # f"{outPath}d.txt", + # "--cpu", + # str(hmm_cpu), + # "-o", + # "/dev/null", + # f"{dbDir}dbCAN_sub.hmm", + # f"{outPath}uniInput", + # ] + # ) + # dbsub.wait() + + dbsub_list = [ + "hmmsearch", + 
"--domtblout", + f"{outPath}d.txt", + "--cpu", + str(hmm_cpu), + "-o", + "/dev/null", + f"{dbDir}dbCAN_sub.hmm", + f"{outPath}uniInput", + ] + + dbsub_str = " ".join(dbsub_list) + os.system(dbsub_str) - hmm_parser_output = hmmer_parser.run(f"{outPath}d.txt", eval_num=hmm_eval, coverage=hmm_cov) - with open(f"{outPath}dtemp.out", "w") as temp_hmmer_file: - temp_hmmer_file.write(hmm_parser_output) + hmm_parser_output = hmmer_parser.run(f"{outPath}d.txt", eval_num=hmm_eval, coverage=hmm_cov) + with open(f"{outPath}dtemp.out", "w") as temp_hmmer_file: + temp_hmmer_file.write(hmm_parser_output) print("total time:", time.time() - ticks) @@ -197,10 +166,10 @@ def run_dbCAN( cluster=None, dbCANFile="dbCAN.txt", dia_eval=1e-102, - dia_cpu=4, + dia_cpu=8, hmm_eval=1e-15, hmm_cov=0.35, - hmm_cpu=4, + hmm_cpu=8, dbcan_thread=5, dbcan_offset=2, tf_eval=1e-4, @@ -418,10 +387,10 @@ def run_dbCAN( processed_lines = [] for line in f: row = line.rstrip().split("\t") - row.append(float(int(row[6]) - int(row[5])) / int(row[1])) + row.append(str(float(int(row[6]) - int(row[5])) / int(row[1]))) if float(row[4]) <= 1e-15 and float(row[-1]) >= 0.35: - # out.write("\t".join([str(x) for x in row]) + "\n") - processed_lines.append("\t".join([str(x) for x in row])) + processed_lines.append("\t".join(row)) + # Process dbcan-sub.hmm.out content updated_lines = [line for line in processed_lines if line.strip()] @@ -713,8 +682,8 @@ def run_dbCAN( with open(auxFile) as f: with open(outDir + prefix + "cgc.gff", "w") as out: for line in f: - if not line.startswith("#"): - row = line.rstrip().split("\t") + row = line.rstrip().split("\t") + if (not line.startswith("#")) and len(row) >= 9: if row[2] == "CDS": note = row[8].strip().rstrip(";").split(";") gene = "" @@ -825,135 +794,98 @@ def run_dbCAN( # End SignalP combination ####################### ####################### - # start Overview + # start overview Generation + def unique(sequence): + """ Remove duplicates from a list while keeping the 
original order. """ + seen = set() + return [x for x in sequence if not (x in seen or seen.add(x))] + + def read_lines_if_file_exists(filepath): + """ Read non-empty lines from a file if it exists, return an empty list otherwise. """ + if os.path.exists(filepath): + with open(filepath, 'r') as file: + return [line.strip() for line in file if line.strip()] + return [] + print("Preparing overview table from hmmer, dbCAN_sub and diamond output...") - workdir = outDir + prefix - - # a function to remove duplicates from lists while keeping original order - def unique(seq): - exists = set() - return [x for x in seq if not (x in exists or exists.add(x))] - - arr_dbsub = None - arr_hmmer = None - - # check if files exist. if so, read files and get the gene numbers - if tools[0]: - arr_diamond = open(workdir + "diamond.out").readlines() - diamond_genes = [arr_diamond[i].split()[0] for i in range(1, len(arr_diamond))] # or diamond_genes = [] - - if tools[1]: - arr_hmmer = open(workdir + "hmmer.out").readlines() - hmmer_genes = [arr_hmmer[i].split()[2] for i in range(1, len(arr_hmmer))] # or hmmer_genes = [] - - if tools[2]: - arr_dbsub = open(workdir + "dbcan-sub.hmm.out").readlines() - dbsub_genes = [arr_dbsub[i].split("\t")[5] for i in range(1, len(arr_dbsub))] # or dbsub_genes = [] - - if use_signalP and (os.path.exists(workdir + "signalp.out")): - arr_sigp = open(workdir + "signalp.out").readlines() - sigp_genes = {} - for i in range(0, len(arr_sigp)): - row = arr_sigp[i].split() - sigp_genes[row[0]] = row[4] # previous one is row[2], use Y-score instead from suggestion of Dongyao Li - - # remove duplicates from input lists - if not tools[0]: - diamond_genes = [] - if not tools[1]: - hmmer_genes = [] - if not tools[2]: - dbsub_genes = [] - - if len(dbsub_genes) > 0: - if dbsub_genes[-1] is None: - dbsub_genes.pop() - dbsub_genes = unique(dbsub_genes) - if "hmmer_genes" in locals(): - hmmer_genes.pop() - hmmer_genes = unique(hmmer_genes) - if "diamond_genes" in locals(): 
- diamond_genes.pop() - diamond_genes = unique(diamond_genes) - - # parse input, stroe needed variables - if tools[0] and (len(arr_diamond) > 1): - diamond_fams = {} - for i in range(1, len(arr_diamond)): - row = arr_diamond[i].split("\t") - fam = row[1].strip("|").split("|") - diamond_fams[row[0]] = fam[1:] - - if tools[1] and (len(arr_hmmer) > 1): - hmmer_fams = {} - for i in range(1, len(arr_hmmer)): - row = arr_hmmer[i].split("\t") - fam = row[0].split(".") - fam = fam[0] + "(" + row[7] + "-" + row[8] + ")" - if row[2] not in hmmer_fams: - hmmer_fams[row[2]] = [] - hmmer_fams[row[2]].append(fam) - - if tools[2] and (len(arr_dbsub) > 1): - dbsub_fams = {} - for i in range(1, len(arr_dbsub)): - row_ori = arr_dbsub[i].split("\t") - fams_ID = row_ori[5] - if fams_ID not in dbsub_fams: - dbsub_fams[fams_ID] = {} - dbsub_fams[fams_ID]["fam_name"] = [] - dbsub_fams[fams_ID]["ec_num"] = [] - - dbsub_fams[fams_ID]["fam_name"].append(row_ori[0]) - dbsub_fams[fams_ID]["ec_num"].append(row_ori[2]) - - # overall table + arr_diamond = read_lines_if_file_exists(outPath + "diamond.out") + arr_hmmer = read_lines_if_file_exists(outPath + "hmmer.out") + arr_dbsub = read_lines_if_file_exists(outPath + "dbcan-sub.hmm.out") + + diamond_genes = unique([line.split()[0] for line in arr_diamond[1:]]) + hmmer_genes = unique([line.split()[2] for line in arr_hmmer[1:]]) + dbsub_genes = unique([line.split("\t")[5] for line in arr_dbsub[1:]]) + + sigp_genes = {} + if use_signalP and os.path.exists(outPath + "signalp.out"): + arr_sigp = read_lines_if_file_exists(outPath + "signalp.out") + for line in arr_sigp: + parts = line.split() + sigp_genes[parts[0]] = parts[4] + + diamond_fams, hmmer_fams, dbsub_fams = {}, {}, {} + + if tools[0]: # Diamond + diamond_fams = {row.split("\t")[0]: row.split("\t")[1].strip("|").split("|")[1:] for row in arr_diamond[1:]} + + if tools[1]: # Hmmer + for row in arr_hmmer[1:]: + parts = row.split("\t") + fam = parts[0].split(".")[0] + "(" + parts[7] + "-" + 
parts[8] + ")" + hmmer_fams.setdefault(parts[2], []).append(fam) + + if tools[2]: # dbCAN_sub + for row in arr_dbsub[1:]: + parts = row.split("\t") + gene_id = parts[5] + dbsub_fams.setdefault(gene_id, {"fam_name": [], "ec_num": []}) + dbsub_fams[gene_id]["fam_name"].append(parts[0]) + dbsub_fams[gene_id]["ec_num"].append(parts[2]) + # Create the overview table all_genes = unique(hmmer_genes + dbsub_genes + diamond_genes) - with open(workdir + "overview.txt", "w+") as fp: + with open(outPath + "overview.txt", "w+") as fp: + headers = ["Gene ID", "EC#", "HMMER", "dbCAN_sub", "DIAMOND", "#ofTools"] if use_signalP: - fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\tSignalp\t#ofTools\n") - else: - fp.write("Gene ID\tEC#\tHMMER\tdbCAN_sub\tDIAMOND\t#ofTools\n") + headers.insert(5, "Signalp") + fp.write("\t".join(headers) + "\n") + for gene in all_genes: - csv = [gene] + row = [gene] num_tools = 0 - if tools[2] and arr_dbsub is not None and (gene in dbsub_genes): - if dbsub_fams[gene]["ec_num"] == []: - csv.append("-") - else: - csv.append("|".join(dbsub_fams[gene]["ec_num"])) - else: - csv.append("-") + # dbCAN_sub + row.append("|".join(dbsub_fams[gene]["ec_num"]) if gene in dbsub_fams else "-") - if tools[1] and arr_hmmer is not None and (gene in hmmer_genes): + # HMMER + if gene in hmmer_fams: num_tools += 1 - csv.append("+".join(hmmer_fams[gene])) + row.append("+".join(hmmer_fams[gene])) else: - csv.append("-") - - if tools[2] and arr_dbsub is not None and (gene in dbsub_genes): + row.append("-") + + # dbCAN_sub (fam_name) + if tools[2] and gene in dbsub_fams: + row.append("+".join(dbsub_fams[gene]["fam_name"])) num_tools += 1 - csv.append("+".join(dbsub_fams[gene]["fam_name"])) else: - csv.append("-") + row.append("-") - if tools[0] and arr_diamond is not None and (gene in diamond_genes): + # DIAMOND + if gene in diamond_fams: num_tools += 1 - csv.append("+".join(diamond_fams[gene])) + row.append("+".join(diamond_fams[gene])) else: - csv.append("-") + 
row.append("-") + + # SignalP if use_signalP: - if gene in sigp_genes: - csv.append("Y(1-" + sigp_genes[gene] + ")") - else: - csv.append("N") - csv.append(str(num_tools)) - temp = "\t".join(csv) + "\n" - fp.write(temp) - print("overview table complete. Saved as " + workdir + "overview.txt") + row.append("Y(1-" + sigp_genes[gene] + ")" if gene in sigp_genes else "N") + + row.append(str(num_tools)) + fp.write("\t".join(row) + "\n") + print(f"Overview table complete. Saved as {outPath}overview.txt") # End overview @@ -1052,7 +984,7 @@ def rundbCAN_parser(): default="all", help="Choose gram+(p) or gram-(n) for proteome/prokaryote nucleotide, which are params of SingalP, only if user use singalP", ) - parser.add_argument("-v", "--version", default="4.1.0", type=str) + parser.add_argument("-v", "--version", default="4.1.2", type=str) # dbCAN-sub dbCAN_sub_group = parser.add_argument_group("dbCAN-sub parameters") dbCAN_sub_group.add_argument("--dbcan_thread", "-dt", default=12, type=int) @@ -1087,7 +1019,7 @@ def rundbCAN_parser(): cgcsubstrate_group.add_argument('--only_sub',action='store_false',help="Only run substrate prediction for PUL. 
If this parameter is presented, dbcan will skip the CAZyme annotation and CGC prediction.") cgcsubstrate_group.add_argument("--cgc_substrate", action="store_true", help="run cgc substrate prediction?") cgcsubstrate_group.add_argument("--pul", help="dbCAN-PUL PUL.faa") - cgcsubstrate_group.add_argument("-o", "--out", default="sub.prediction.out") + cgcsubstrate_group.add_argument("-o", "--out", default="substrate.out") cgcsubstrate_group.add_argument("-w", "--workdir", type=str, default=".") cgcsubstrate_group.add_argument("-env", "--env", type=str, default="local") cgcsubstrate_group.add_argument( diff --git a/dbcan/utils/dbcan_build.py b/dbcan/utils/dbcan_build.py new file mode 100644 index 000000000..ef09f442c --- /dev/null +++ b/dbcan/utils/dbcan_build.py @@ -0,0 +1,123 @@ +import os +import argparse +import requests +from tqdm import tqdm +import shutil +from concurrent.futures import ThreadPoolExecutor +import subprocess + + +def download_file(url, filename, position): + """使用requests下载文件并用tqdm显示进度条""" + response = requests.get(url, stream=True) + total_size = int(response.headers.get('content-length', 0)) + chunk_size = 1024 + with open(filename, 'wb') as f, tqdm( + desc=filename, + total=total_size, + unit='iB', + unit_scale=True, + position=position, + leave=False # 设置为False,下载完成后隐藏进度条 + ) as bar: + for chunk in response.iter_content(chunk_size=chunk_size): + size = f.write(chunk) + if size is not None: + bar.update(int(size)) + +def parse_args(): + """解析命令行参数""" + parser = argparse.ArgumentParser(description='dbCAN Database Downloader') + parser.add_argument('--cpus', type=int, default=8, help='Number of CPUs for parallel downloads') + parser.add_argument('--db-dir', type=str, default='db', help='Database directory name') + parser.add_argument('--clean', action='store_true', help='Clean up temporary files after processing') + return parser.parse_args() +def clean_up(db_dir): + try: + shutil.rmtree(db_dir) + print(f"Folder '{db_dir}' and all its contents 
have been deleted.") + except OSError as error: + print(f"Error: {error}") + print(f"Failed to delete the folder '{db_dir}'.") +def get_remote_file_size(url): + """获取远程文件的大小""" + response = requests.head(url, allow_redirects=True) + if 'content-length' in response.headers: + return int(response.headers['content-length']) + else: + return 0 +def run_command(cmd): + """运行单个shell命令""" + subprocess.run(cmd, shell=True, check=True) + +def main(): + args = parse_args() + + if args.clean and os.path.exists(args.db_dir): + clean_up(args.db_dir) + + if not os.path.exists(args.db_dir): + os.makedirs(args.db_dir) + os.chdir(args.db_dir) + + download_urls = [ + "https://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv", + "https://bcb.unl.edu/dbCAN2/download/Databases/PUL.faa", + "https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx", + "https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz", + "https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/dbCAN-HMMdb-V12.txt", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm", + "https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm" + ] + + total_size = sum(get_remote_file_size(url) for url in download_urls) + + with ThreadPoolExecutor(max_workers=args.cpus) as executor: + futures = [] + for i, url in enumerate(download_urls): + filename = os.path.basename(url) + futures.append(executor.submit(download_file, url, filename, i)) + + # 显示总的进度条 + with tqdm(total=total_size, unit='iB', unit_scale=True, position=len(download_urls), leave=False) as bar: + for future in futures: + result = future.result() + if result is not None: + bar.update(int(result)) + + # 执行其他命令 + other_commands = [ + "mv 
fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv", + "makeblastdb -in PUL.faa -dbtype prot", + "mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx", + "tar xzvf dbCAN-PUL.tar.gz", + "hmmpress -f dbCAN_sub.hmm", + "mv CAZyDB.07262023.fa CAZyDB.fa", + "diamond makedb --in CAZyDB.fa -d CAZy", + "mv dbCAN-HMMdb-V12.txt dbCAN.txt", + "hmmpress dbCAN.txt", + "diamond makedb --in tcdb.fa -d tcdb", + "hmmpress -f tf-1.hmm", + "hmmpress -f tf-2.hmm", + "hmmpress -f stp.hmm" + ] + for cmd in other_commands: + subprocess.run(cmd, shell=True) + + os.chdir('..') + # 下载样本文件 + sample_commands = [ + "wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna", + "wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa", + "wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff" + ] + + with ThreadPoolExecutor() as executor: + executor.map(run_command, sample_commands) + +if __name__ == "__main__": + main() diff --git a/dbcan/utils/plots.py b/dbcan/utils/plots.py index 422a512cf..11d52e9e0 100644 --- a/dbcan/utils/plots.py +++ b/dbcan/utils/plots.py @@ -15,10 +15,10 @@ from matplotlib.patches import Patch #plt.style.use('seaborn') from dbcan.utils.utils import cgc_standard_line -from dbcan_cli.syntenic_plot import syntenic_plot,read_blast_result_cgc,read_UHGG_CGC_stanrdard_out,read_PUL_cgcgff -from dbcan_cli.syntenic_plot import Get_parameters_for_plot,plot_Polygon_homologous,plot_syntenic_block -from dbcan_cli.syntenic_plot import Get_Position as synGet_Position -from dbcan_cli.syntenic_plot import plot_genome_line as synplot_genome_line +from dbcan.cli.syntenic_plot import syntenic_plot,read_blast_result_cgc,read_UHGG_CGC_stanrdard_out,read_PUL_cgcgff +from dbcan.cli.syntenic_plot import Get_parameters_for_plot,plot_Polygon_homologous,plot_syntenic_block +from dbcan.cli.syntenic_plot import Get_Position as synGet_Position +from dbcan.cli.syntenic_plot import plot_genome_line as synplot_genome_line import matplotlib as mpl 
mpl.rcParams['pdf.fonttype'] = 42 mpl.rcParams['ps.fonttype'] = 42 diff --git a/docker/cazyme_annotation/Dockerfile b/docker/cazyme_annotation/Dockerfile new file mode 100644 index 000000000..21510d658 --- /dev/null +++ b/docker/cazyme_annotation/Dockerfile @@ -0,0 +1,14 @@ +FROM continuumio/miniconda3 + +# set work dir +WORKDIR /app +COPY dbcan.yml . + +# install conda packages +RUN conda env create -f dbcan.yml && conda clean -ya + +# add run_dbcan environment to path +RUN echo "source activate CAZyme_annotation" > ~/.bashrc +ENV PATH /opt/conda/envs/CAZyme_annotation/bin:$PATH + +CMD [ "run_dbcan", "-h" ] diff --git a/docs/_static/img/Fig1.png b/docs/_static/img/Fig1.png new file mode 100644 index 000000000..49196ab24 Binary files /dev/null and b/docs/_static/img/Fig1.png differ diff --git a/docs/installation.rst b/docs/installation.rst index c0b0470d1..4fba40dd2 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -27,7 +27,7 @@ or `Miniconda `_. Then, you .. code-block:: shell - conda create --dbcan python=3.8 + conda create --name dbcan python=3.8 If you already have a ``conda`` environment, you can skip the step above. @@ -37,6 +37,26 @@ To install the `dbcan`_ package, use the ``conda install`` command: conda install dbcan -c conda-forge -c bioconda + +Build database +-------------- + +You can build database via this command: + +.. code-block:: shell + + dbcan_build --cpus 8 --db-dir db --clean + + +1. `--cpus` indicates count of cpu you can use. Try as many as possible for fast building. + +2. `--db-dir` indicates database folder path. Default is `db` under your current working directory + +3. `--clean` indicates clean the folder indicated by `--db-dir`. +You can remove this parameter if you don't want to clean, but we recommend you add this to keep +away from index contamination. 
+ + Installing with PyPI -------------------- @@ -55,10 +75,12 @@ dependencies: Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process. **Installing SignalP (Optional)**: + - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows: 1. Visit the `SignalP website `_. 2. Submit a download `request `_. 3. Post-download, add `SignalP` to your system's environmental variables to make it executable. + - For installation assistance, refer to the :doc:`faq/signalp_installation`. This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it. diff --git a/docs/references.bib b/docs/references.bib index 8a8936742..b76464443 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -28,3 +28,43 @@ @article{2017:nielsen year={2017}, publisher={Springer} } + +@article{2021:Wastyk, + title={Gut-microbiota-targeted diets modulate human immune status}, + author={Wastyk, Hannah C and Fragiadakis, Gabriela K and Perelman, Dalia and Dahan, Dylan and Merrill, Bryan D and Feiqiao, B Yu and Topf, Madeline and Gonzalez, Carlos G and Van Treuren, William and Han, Shuo and others}, + journal={Cell}, + volume={184}, + number={16}, + pages={4137--4153}, + year={2021}, + publisher={Elsevier} +} + +@article{2023:Priest, + title={Carbohydrates and carbohydrate degradation gene abundance and transcription in Atlantic waters of the Arctic}, + author={Priest, Taylor and Vidal-Melgosa, Silvia and Hehemann, Jan-Hendrik and Amann, Rudolf and Fuchs, Bernhard M}, + journal={ISME communications}, + volume={3}, + number={1}, + pages={130}, + year={2023}, + publisher={Nature Publishing Group UK London} +} + +@article{2023:carter, + title={Ultra-deep sequencing of Hadza hunter-gatherers recovers vanishing gut microbes}, + 
author={Carter, Matthew M and Olm, Matthew R and Merrill, Bryan D and Dahan, Dylan and Tripathi, Surya and Spencer, Sean P and Feiqiao, B Yu and Jain, Sunit and Neff, Norma and Jha, Aashish R and others}, + journal={Cell}, + year={2023}, + publisher={Elsevier} +} + + +@article{2023:dbCAN3, + title={dbCAN3: automated carbohydrate-active enzyme and substrate annotation}, + author={Zheng, Jinfang and Ge, Qiwei and Yan, Yuchen and Zhang, Xinpeng and Huang, Le and Yin, Yanbin}, + journal={Nucleic Acids Research}, + pages={gkad328}, + year={2023}, + publisher={Oxford University Press} +} diff --git a/docs/user_guide/database_preparation.rst b/docs/user_guide/database_preparation.rst index 35775e4ed..368f123ac 100644 --- a/docs/user_guide/database_preparation.rst +++ b/docs/user_guide/database_preparation.rst @@ -1,16 +1,14 @@ -Database preparation -==================== - -Install different databases and make index for them. +Database Installation Command +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell + :name: include-this-part test -d db || mkdir db cd db \ && wget http://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv && mv fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL_12112023.faa && mv PUL_12112023.faa PUL.faa && makeblastdb -in PUL.faa -dbtype prot \ + && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL.faa && makeblastdb -in PUL.faa -dbtype prot \ && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx && mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.txt && mv dbCAN-PUL_12-12-2023.txt dbCAN-PUL.txt \ && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz && tar xvf dbCAN-PUL.tar.gz && rm dbCAN-PUL.tar.gz \ && wget https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm && hmmpress dbCAN_sub.hmm \ && wget 
https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa && mv CAZyDB.07262023.fa CAZyDB.fa && diamond makedb --in CAZyDB.fa -d CAZy \ diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index d3e45c2ec..a7c5979e5 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -10,3 +10,5 @@ User Guide run_with_CGCFinder run_from_raw_reads run_from_DNA_sequence + run_from_raw_reads_ex + run_from_raw_reads_pr diff --git a/docs/user_guide/run_from_DNA_sequence.rst b/docs/user_guide/run_from_DNA_sequence.rst index 31438d08b..20ea72505 100644 --- a/docs/user_guide/run_from_DNA_sequence.rst +++ b/docs/user_guide/run_from_DNA_sequence.rst @@ -1,4 +1,4 @@ -Run from Protein Sequence +Run from DNA Sequence ========================= This section provides a quick guide to running the run_dbcan tool suite with example data and explains the output files generated. diff --git a/docs/user_guide/run_from_raw_reads.rst b/docs/user_guide/run_from_raw_reads.rst index 95d02f8a7..0d7d2ea66 100644 --- a/docs/user_guide/run_from_raw_reads.rst +++ b/docs/user_guide/run_from_raw_reads.rst @@ -1,64 +1,174 @@ -Run from Raw Reads -================== +Run from Raw Reads: Automated CAZyme and Glycan Substrate Annotation in Microbiomes: A Step-by-Step Protocol +============================================================================================================ Introduction ------------ -dbCAN and run_dbcan require assembled contigs for CAZyme annotation. -Typically, microbiome researchers begin with raw sequencing reads (metagenomic or metatranscriptomic) from various samples. -These reads must be pre-processed and assembled prior to annotation. -Additionally, there's often a need for CAZyme abundance comparison -and visualization across multiple samples. To address these requirements, -this protocol paper provides a comprehensive guide on CAZyme annotation. 
-It includes steps from initial sequencing reads to the visualization of CAZyme occurrence and abundance across samples. -Key topics covered are software setup, read pre-processing, metagenome assembly, gene prediction, -CAZyme and CGC prediction, glycan substrate prediction, and data visualization. +Overview +```````` -.. image:: ../_static/img/Picture1.png +In this tutorial, we present a comprehensive protocol to annotate CAZymes and glycan substrates in microbiome datasets. Using three real-world microbiome example datasets: + +1. :ref:`Carter2023 ` :cite:`2023:carter`, +2. :ref:`Wastyk2021 ` :cite:`2021:Wastyk` (See :doc:`run_from_raw_reads_ex`), and, +3. :ref:`Priest2023 ` :cite:`2023:Priest` (See :doc:`run_from_raw_reads_pr.rst`), + +this guide will walk you through each step of the computational workflow for analyzing occurrence and abundance. The workflow, depicted in Fig. 1, is designed to be user-friendly and does not require extensive programming knowledge. + +.. image:: ../_static/img/Fig1.png :alt: workflow figure - :width: 800px + :width: 700px :align: center +.. |centered-text| raw:: html -For this tutorial, we provide a comprehensive pipeline to teach users how to run CAZyme annotations from raw reads to generate abundance information. -We use Carter2023 and the individual sample assembly route of the figure above. The procedure has 4 modules and 16 steps (P1-P16). -First, we need to create the environment. +
Fig.1 Overview of the protocol
-Installation and Data Preparation ---------------------------------- +|centered-text| +Workflow Steps +`````````````` -1. Downloading Carter2023 (Table 2) Raw Reads +1. **Pre-Processing of Raw Sequencing Reads:** + Begin with the preprocessing of raw sequencing reads. This includes the removal of contaminants, adapter sequences, and trimming of low-quality reads. We'll use `trim_galore` and `Kraken2` for this purpose (Steps 1-2). +2. **Contig Assembly:** + The clean reads from each sample are then assembled into contigs using `MEGAHIT` (Step 3). -To download the required raw reads, use the following wget commands: +3. **Gene Model Annotation:** + These contigs are subsequently passed to `Prokka` for gene model annotation (Step 4). -.. code-block:: shell +4. **CAZyme and CGC Annotation:** + The next phase involves annotating the contigs for CAZymes and CGCs. This is achieved by `run_dbcan`, utilizing the protein sequence (faa) and gene annotation (gff) files produced by `Prokka` (Step 5). - wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Dry2014_1.fastq.gz - wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Dry2014_2.fastq.gz - wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Wet2014_1.fastq.gz - wget https://bcb.unl.edu/dbCAN_toturial/raw_reads/Wet2014_2.fastq.gz +5. **Location Mapping and Substrate Prediction:** + Step 6 involves mapping the location of annotated CAZymes and CGCs on the contigs. In Step 7, `run_dbcan`'s substrate prediction function infers glycan substrates for these CAZymes and CGCs. -2. Create Anaconda Environment +6. **Abundance Calculation:** + To quantify the abundance of CAZymes, substrates, and CGCs, clean reads from Step 2 are mapped to the nucleotide coding sequences (CDS) of proteins from Step 4 (Steps 8-14). +7. **Data Visualization:** + Finally, steps 15-20 focus on visualizing the occurrence and abundance results. We provide Python scripts for creating publication-quality plots in PDF format. 
-Create and activate a new Anaconda environment with the following steps: + +.. image:: ../_static/img/Picture1.png + :alt: workflow figure + :width: 800px + :align: center + +.. |centered-text2| raw:: html + +
Fig.2 Experimental design of CAZyme annotation in microbiomes
+ +|centered-text2| + +User Requirements +````````````````` + +This protocol is designed for users who are comfortable with the Linux command-line interface and can execute Python scripts in the terminal. While extensive programming experience is not necessary, users should be familiar with editing Linux commands and plain-text scripts within a command-line environment. + +Equipment +--------- + +Operating System +```````````````` + +All the modules of this protocol (Fig. 2) are designed to run on a command line (CLI) environment with a Linux OS (e.g., Ubuntu). +We recommend users install these modules and execute all commands on a high-performance Linux cluster or workstation with >32 CPUs +and 128GB of RAM instead of a laptop, as the assembly of raw reads has a high demand of CPU and RAM. + +Once users finish the data visualization module (Fig. 2), the resulting image files (PDF format) can be copied +to a desktop or laptop with GUI for data visualization. In practice, users can choose not to use our read processing +module and read mapping module. They may instead use their preferred tools for preparing input data for run_dbcan module +and for calculating abundance for CAZymes and substrates. In that case, they can skip the installation of our read processing +module and read mapping module in this protocol. + +Data Files +`````````` + +The example dataset (Carter2023) is described above and detailed in Table 2. +The raw read data, intermediate data from each analysis step, and final result +data and visualization files are organized in nested folders available on our +website https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/, Fig. 5) and +https://dbcan.readthedocs.io. These websites also include data files and +protocols for two additional example datasets from :cite:`2021:Wastyk` and :cite:`2023:Priest`, +which are not included in this protocol paper. 
We will use the independent sample +assembly route for :cite:`2023:carter` in the main text to demonstrate all the commands. +Commands for the other routes are provided in Supplementary Protocols. + +Software and versions +````````````````````` + +- **Anaconda** (`Anaconda `_, version 23.7.3) +- **MEGAHIT** (`MEGAHIT `_, version 1.2.9) +- **BWA** (`BWA `_, version 0.7.17-r1188) +- **HMMER** (`HMMER `_, version 3.3) +- **DIAMOND** (`DIAMOND `_, version 2.1.8) +- **BLAST** (`BLAST `_, version 2.14) +- **TrimGalore** (`TrimGalore `_, version 0.6.0) +- **Prokka** (`Prokka `_, version 1.4) +- **Samtools** (`Samtools `_, version 1.7) +- **Seqkit** (`Seqkit `_, version 2.5.1) +- **Bedtools** (`Bedtools `_, version 2.27.1) +- **Kraken2** (`Kraken2 `_, version 2.1.1) +- **run_dbcan** (`run_dbcan `_, version 4.0.0) +- **BBTools** (`BBTools `_, version 37.62) +- **Seqtk** (`Seqtk `_, version 1.2-r94) +- **Minimap2** (`Minimap2 `_, version 2.26-r1175) +- **Flye** (`Flye `_, version 2.9.3-b1797) +- **Mmseqs2** (`Mmseqs2 `_, release 15-6f452) + +Anaconda as the Software Management System +`````````````````````````````````````````` +Anaconda will be used as the software package management system for this +protocol. Anaconda uses the ``conda`` command to create a virtual +environment to facilitate the easy installation of software packages +and running command line jobs. With the conda environment, users do +not need to worry about the potential issues of package dependencies +and version conflicts. + +Like in all bioinformatics data analysis tasks, we recommend users organize +their data files by creating a dedicated folder for each data analysis +step. + +.. _cater_2023: + +Example 1: Carter2023 Dataset :cite:`2023:carter` +------------------------------------------------- + +S1. Download Carter2023 (Table 2) raw reads (~10min) +````````````````````````````````````````````````````` +To download the required raw reads, use the following wget commands: .. 
code-block:: shell - conda create -n CAZyme_annotation python=3.9 - conda activate CAZyme_annotation + wget https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/individual_assembly/Dry2014_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/individual_assembly/Dry2014_2.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/individual_assembly/Wet2014_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/individual_assembly/Wet2014_2.fastq.gz + +These raw data were originally downloaded from +https://www.ncbi.nlm.nih.gov/sra/?term=ERR7745896 +and https://www.ncbi.nlm.nih.gov/sra/?term=ERR7738162 +and renamed to indicate their collected seasons (Table 2). -3. Installing Bioinformatics Dependencies and dbCAN +S2. Install Anaconda (~3min) +```````````````````````````` -Install all necessary bioinformatics tools either with a single command or individually: +Download and install the latest version of Anaconda for Linux from +https://www.anaconda.com/download#downloads. Once Anaconda is +successfully installed, proceed to create a dedicated conda environment +named `CAZyme_annotation` and activate it. +Subsequently, all the required tools can be seamlessly installed within +this environment. .. code-block:: shell - conda install -f dbcan.configure + conda create -n CAZyme_annotation python=3.9 + conda activate CAZyme_annotation -Alternatively, install the tools one by one: +S3. Install all bioinformatics tools (~10min) +````````````````````````````````````````````` .. code-block:: shell @@ -73,47 +183,66 @@ Alternatively, install the tools one by one: conda install -c conda-forge -c bioconda mmseqs2 conda install dbcan -c conda-forge -c bioconda +Alternatively, users can run a single configuration file dbcan.yml +(replace S2 and S3) that streamlines the above +configuration of all the essential software required for this protocol. -4. Database Installation +.. 
code-block:: shell + + git clone https://github.com/linnabrown/run_dbcan.git + cd run_dbcan + conda env create -f dbcan.yml + conda activate CAZyme_annotation +S4. Configure databases required by run_dbcan (~2h) +```````````````````````````````````````````````````` To install the databases, execute the following commands: +.. include:: database_preparation.rst + +Download database required by Kraken2 (very slow; can be skipped +if users do not intend to run Kraken2): + .. code-block:: shell - test -d db || mkdir db - cd db \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/fam-substrate-mapping-08012023.tsv \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/PUL_12112023.faa && mv PUL_12112023.faa PUL.faa && makeblastdb -in PUL.faa -dbtype prot \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.xlsx \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL_12-12-2023.txt \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-PUL.tar.gz && tar xvf dbCAN-PUL.tar.gz \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/dbCAN_sub.hmm && hmmpress dbCAN_sub.hmm \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/CAZyDB.07262023.fa && diamond makedb --in CAZyDB.07262023.fa -d CAZy \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/dbCAN-HMMdb-V12.txt && mv dbCAN-HMMdb-V12.txt dbCAN.txt && hmmpress dbCAN.txt \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/tcdb.fa && diamond makedb --in tcdb.fa -d tcdb \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-1.hmm && hmmpress tf-1.hmm \ - && wget http://bcb.unl.edu/dbCAN2/download/Databases/V12/tf-2.hmm && hmmpress tf-2.hmm \ - && wget https://bcb.unl.edu/dbCAN2/download/Databases/V12/stp.hmm && hmmpress stp.hmm \ - && kraken2-build --standard --db K2 - -The downloaded files must be all in the right location (the db folder). -The CAZyDB.08062022.fa file is needed for DIAMOND search (Table 1). 
-The dbCAN-HMMdb-V11.txt and dbCAN_sub.hmm files are for HMMER search. -The tcdb.fa, tf-1.hmm, tf-2.hmm, and stp.hmm files are for CGC prediction. -The PUL.faa file consists of protein sequences from experimentally validated PULs for BLAST search to predict substrates for CGCs. -The dbCAN-PUL_07-01-2022.txt and dbCAN-PUL_07-01-2022.xlsx files contain PUL-substrate mapping curated from literature. -Lastly, the fam-substrate-mapping-08252022.tsv file is the family-EC-substrate mapping table for the prediction of CAZyme substrates. + kraken2-build --standard --db K2 -.. warning:: - The conda installation and configuration step may experience prolonged time while resolving environment dependencies. Users should be patient during this process. Alternatively, users consider "mamba", - another Python package manager that offers similar functionality to Anaconda. - Information and access to mamba software can be found at https://github.com/mamba-org/mamba. +**CRITICAL STEP** + + The downloaded files must be all in the right location (the db folder). + + The CAZyDB.07262023.fa file is needed for DIAMOND search (Table 1). + + The dbCAN-HMMdb-V12.txt and dbCAN_sub.hmm files are for HMMER search. + + The tcdb.fa, tf-1.hmm, tf-2.hmm, and stp.hmm files are for CGC prediction. + The PUL.faa file consists of protein sequences from experimentally + validated PULs for BLAST search to predict substrates for CGCs. + The dbCAN-PUL_12-12-2023.txt and dbCAN-PUL_12-12-2023.xlsx files contain + PUL-substrate mapping curated from literature. -Module 1: Reads processing to obtain contigs + Lastly, the + fam-substrate-mapping-08012023.tsv file is the family-EC-substrate + mapping table for the prediction of CAZyme substrates. + +.. warning:: + + Users should use a clean version of Anaconda. If the above steps failed, we suggest users reinstall their Anaconda. + The Anaconda installation and configuration step may experience + prolonged time while resolving environment dependencies. 
+ Users should be patient during this process. Alternatively, + users may consider "mamba", another Python package manager + that offers similar functionality to Anaconda. Information and + access to mamba software can be found at + https://github.com/mamba-org/mamba. + +Procedure -------------------------------------------- +Module 1: Reads processing (Fig. 2) to obtain contigs +````````````````````````````````````````````````````` P1. Contamination Check ^^^^^^^^^^^^^^^^^^^^^^^ @@ -127,8 +256,9 @@ Use `kraken2` to check for contaminated reads: Kraken2 found very little contamination in the Carter2023 data. Consequently, there was no need for the contamination removal step. -If contamination is identified, users can align the reads to the reference genomes of potential -contamination source organisms to remove the aligned reads (Box 1). The most common source in human microbiome studies is from human hosts. +If contamination is identified, users can align the reads to the reference genomes of potential contamination source organisms to remove +the aligned reads (Box 1). The most common source in human microbiome studies is from human hosts. + Box 1: Removing Contamination Reads from Humans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,6 +272,7 @@ Box 1: Removing Contamination Reads from Humans -rw-rw-r-- 1 jinfang jinfang 5.1G Dec 12 09:47 Wet2014.kraken.output -rw-rw-r-- 1 jinfang jinfang 1.1M Dec 12 09:48 Wet2014.kreport + Suppose from these files, we have identified humans as the contamination source, we can use the following commands to remove the contamination reads by aligning reads to the human reference genome. .. code-block:: shell @@ -155,18 +286,17 @@ Box 1: Removing Contamination Reads from Humans samtools fastq -1 Wet2014_1.clean.fq.gz -2 Wet2014_2.clean.fq.gz Wet2014.hg38.unmap.bam samtools fastq -1 Dry2014_1.clean.fq.gz -2 Dry2014_2.clean.fq.gz Dry2014.hg38.unmap.bam -P2. 
Trimming Adapters and Low-Quality Reads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +P2. Trim adapter and low-quality reads (TIMING ~20min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell trim_galore --paired Wet2014_1.fastq.gz Wet2014_2.fastq.gz --illumina -j 36 trim_galore --paired Dry2014_1.fastq.gz Dry2014_2.fastq.gz --illumina -j 36 - -Trim_galore is used to trim adapters and low-quality reads. -We specified `--illumina` to indicate that the reads were generated using the Illumina sequencing platform. Nonetheless, trim_galore possesses the ability to automatically detect the adapter, -providing flexibility in adapter handling for users who may know the specific sequencing platform. +We specified `--illumina` to indicate that the reads were generated using the Illumina sequencing platform. +Nonetheless, trim_galore can automatically detect adapters, providing flexibility for users who may not know the specific sequencing platform. Details of trimming are available in the trimming report file (Box 2). Box 2: Example output of `trim_galore` @@ -189,8 +319,8 @@ Box 2: Example output of `trim_galore` .. warning:: During the trimming process, certain reads may be entirely removed due to low quality in their entirety. - Using the --retain_unpaired parameter in trim_galore allows for the preservation of single-end reads. - In this protocol, this option was not select, so that both reads of a forward-revise pair were removed. + Using the ``--retain_unpaired`` parameter in ``trim_galore`` allows for the preservation of single-end reads. + In this protocol, this option was not selected, so that both reads of a forward-reverse pair were removed. P3. 
Assemble reads into contigs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -203,11 +333,11 @@ Use Megahit for assembling reads into contigs: megahit -m 0.5 -t 32 -o megahit_Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000 -MEGAHIT generates two output folders. Each contains five files and one sub-folder (Box 3). -Wet2014.contigs.fa is the final contig sequence file. We set --min-contig-len 1000, +``MEGAHIT`` generates two output folders. Each contains five files and one sub-folder (Box 3). +``Wet2014.contigs.fa`` is the final contig sequence file. We set `--min-contig-len 1000`, a common practice to retain all contigs longer than 1,000 base pairs. -Box 3: Example output of `megahit` +Box 3: Example output of `MEGAHIT` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: shell @@ -217,18 +347,29 @@ Box 3: Example output of `megahit` drwxrwxr-x 2 jinfang jinfang 4.0K Dec 13 04:19 intermediate_contigs -rw-rw-r-- 1 jinfang jinfang 1.1K Dec 13 02:22 options.json -rw-rw-r-- 1 jinfang jinfang 258M Dec 13 04:19 Wet2014.contigs.fa + -rw-rw-r-- 1 jinfang jinfang 208K Dec 13 04:19 Wet2014.log + +.. warning:: + + A common practice in metagenomics after assembly is to further bin contigs into metagenome-assembled genomes (MAGs). + However, in this protocol, we chose not to generate MAGs because not all contigs can be binned into MAGs, and those un-binned + contigs can also encode CAZymes. -P4. Predict Genes with Prokka -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +P4. Predict genes by `Prokka` (TIMING ~21h) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code-block:: shell - prokka --kingdom Bacteria --cpus 36 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa - prokka --kingdom Bacteria --cpus 36 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa + prokka --kingdom Bacteria --cpus 32 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa + prokka --kingdom Bacteria --cpus 32 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa + + +The parameter ``--kingdom Bacteria`` is required for bacterial gene prediction. +To optimize performance, ``--cpus 32`` instructs the utilization of 32 CPUs. +Reduce this number if you do not have this many CPUs on your computer. +The output files comprise both protein and CDS sequences in Fasta format (e.g., ``Wet2014.faa`` and ``Wet2014.ffn`` in Box 4). -The parameter --kingdom Bacteria is required for bacterial gene prediction. -To optimize performance, --CPU 36 instructs the utilization of 36 computer processors. -The output files comprise of both protein and CDS sequences in Fasta format (e.g., Wet2014.faa and Wet2014.ffn in Box 4). Box 4: Example output of `Prokka`
2) to obtain CAZymes, CGCs, and substrates +``````````````````````````````````````````````````````````````````````````````` + +**CRITICAL STEP** + +Users can skip P5 and P6, and directly run P7 (much slower though), if they want to predict not only CAZymes and CGCs, but also substrates. + +P5. CAZyme annotation at the CAZyme family level (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell run_dbcan prokka_Wet2014/Wet2014.faa protein --hmm_cpu 32 --out_dir Wet2014.CAZyme --tools hmmer --db_dir db run_dbcan prokka_Dry2014/Dry2014.faa protein --hmm_cpu 32 --out_dir Dry2014.CAZyme --tools hmmer --db_dir db -Two arguments are required for run_dbcan: the input sequence file (faa files) and the sequence type (protein). -By default, run_dbcan will use three methods (HMMER vs dbCAN HMMdb, DIAMOND vs CAZy, HMMER vs dbCAN-sub HMMdb) for CAZyme annotation (Table 1, Figure 2). -This default setting is equivalent to the use --tools all parameter (Box 5). -Here we only invoke the HMMER vs dbCAN HMMdb for CAZyme annotation at the family level. +Two arguments are required for ``run_dbcan``: the input sequence file (faa files) and the sequence type (protein). +By default, ``run_dbcan`` will use three methods (``HMMER`` vs ``dbCAN HMMdb``, ``DIAMOND`` vs ``CAZy``, ``HMMER`` vs ``dbCAN-sub HMMdb``) for +CAZyme annotation (see Table 1, Fig. 1). This default setting is equivalent to the use of the ``--tools all`` parameter (refer to Box 5). Here, +we only invoke the ``HMMER`` vs ``dbCAN HMMdb`` for CAZyme annotation at the family level. + Box 5: CAZyme annotation with default setting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If the --tools parameter is not set, it is the default setting, which is the same as --tools all. -This will take much longer time to finish (~5h) due to the large size of dbCAN-sub HMMdb (used for substrate prediction for CAZymes, see Table 1). 
+If the ``--tools`` parameter is not set, it defaults to the equivalent of ``--tools all``. +This setting will take a much longer time to finish (approximately 5 hours) due to the large size of ``dbCAN-sub HMMdb`` +(used for substrate prediction for CAZymes, see Table 1). + .. code-block:: shell run_dbcan prokka_Wet2014/Wet2014.faa protein --out_dir Wet2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all run_dbcan prokka_Dry2014/Dry2014.faa protein --out_dir Dry2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all + The sequence type can be `protein`, `prok`, `meta`. If the input sequence file contains metagenomic contig sequences (`fna` file), -the sequence type has to be meta, and prodigal will be called to predict genes. +the sequence type has to be `meta`, and `prodigal` will be called to predict genes. .. code-block:: shell run_dbcan prokka_Wet2014/Wet2014.fna meta --out_dir Wet2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 run_dbcan prokka_Dry2014/Dry2014.fna meta --out_dir Dry2014.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 -5.1. Combine proteins from multiple samples - -.. warning:: - As shown in Figure 3 (step3), proteins from multiple samples can be combined to generate a non-redundant set of proteins. - This will reduce the runtime for the run_dbcan step (step4), as only one faa file will be processed. - However, this does not work for the CGC prediction, as contigs (fna files) from each sample will be needed. - Therefore, this step (5.1) is recommended if users only want the CAZyme annotation, and not recommended if CGCs are also to be predicted. - - -This protein sequence clustering step will create a mapping table with sequence cluster ID and protein IDs from each sample. - -.. code-block:: shell - - mkdir mmseqs_cluster && cd mmseqs_cluster - ln -s ../db . 
- cat ../prokka_Wet2014/Wet2014.faa ../prokka_Dry2014/Dry2014.faa > Dry_Wet.faa - mmseqs easy-cluster --threads 32 -c 0.95 --min-seq-id 0.95 --cov-mode 2 Dry_Wet.faa Dry_Wet_cluster tmp - mv Dry_Wet_cluster_cluster_rep.fasta Dry_Wet.cluster.faa - -This `Dry_Wet.cluster.faa` file now contains the non-redundant set of proteins from the two samples. - -.. code-block:: shell - - grep "^>" Dry_Wet.cluster.faa | tr ">" " " |awk '{print $1}' > Dry_Wet.geneids - seqkit grep -f Dry_Wet.geneids ../prokka_Dry2014/Wet2014.ffn > Dry_Wet.ffn - seqkit grep -f Dry_Wet.geneids ../prokka_Dry2014/Dry2014.ffn >> Dry_Wet.ffn - -This `Dry_Wet.ffn file` now contains the CDS sequences of the non-redundant set of proteins from the two samples. - -.. code-block:: shell - - bwa index Dry_Wet.ffn - ln -s ../Dry2014_1_val_1.fq.gz . && ln -s ../Dry2014_2_val_2.fq.gz . && ln -s ../Wet2014_2_val_2.fq.gz . && ln -s ../Wet2014_1_val_1.fq.gz . - bwa mem -t 32 -o samfiles/Wet2014.CDS.sam Dry_Wet.ffn Wet2014_1_val_1.fq.gz Wet2014 _2_val_2.fq.gz - bwa mem -t 32 -o samfiles/Dry2014.CDS.sam Dry_Wet.ffn Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz - -The two sam files now contain the read mapping result from each sample to the `Dry_Wet.ffn` file. - -P6. CGC prediction. -^^^^^^^^^^^^^^^^^^^ +P6. CGC prediction (TIMING ~15 min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The following commands will re-run run_dbcan to not only predict CAZymes but also CGCs with protein `faa` and gene location `gff` files. .. 
code-block:: shell run_dbcan prokka_Wet2014/Wet2014.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --out_dir Wet2014.PUL --dia_cpu 32 --hmm_cpu 32 - run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_ Dry2014/Dry2014.gff --out_dir Dry2014.PUL --dia_cpu 32 --hmm_cpu 32 + run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --out_dir Dry2014.PUL --dia_cpu 32 --hmm_cpu 32 + + +As mentioned above (see Table 1, Fig. 1), CGC prediction is a featured function added into dbCAN2 in 2018. +To identify CGCs with the protein sequence type, a gene location file (``gff``) must be provided together. If the input sequence type +is ``prok`` or ``meta``, meaning users only have contig ``fna`` files, the CGC prediction can be activated by setting the ``-c cluster`` parameter. -As mentioned above (Table 1, Figure 2), -CGC prediction is a featured function added into dbCAN2 in 2018. -To identify CGCs with the protein sequence type, -a gene location file (gff) must be provided together. -If the input sequence type is prok or meta, meaning users only have contig fna files, the CGC prediction can be activated by setting -c cluster. .. warning:: **Creating own gff file** - If the users would like to create their own gff file (instead of using Prokka or Prodigal), - it is important to make sure the value of ID attribute in the gff file matches the protein ID in the protein faa file. + If the users would like to create their own ``gff`` file (instead of using Prokka or Prodigal), + it is important to make sure the value of ID attribute in the ``gff`` file matches the protein ID in the protein ``faa`` file. - **CGC not found** - If no result is found in CGC output file, it is most likely because the sequence IDs in gff file and faa file do not match. 
Another less likely reason is that the contigs are too short and fragmented and not suitable for CGC prediction. + **[Troubleshooting]CGC not found** + If no result is found in CGC output file, it is most likely because the sequence IDs in ``gff`` file and ``faa`` file do not match. + Another less likely reason is that the contigs are too short and fragmented and not suitable for CGC prediction. P7. Substrate prediction for CAZymes and CGCs (TIMING ~5h) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -355,13 +468,13 @@ The following commands will re-run run_dbcan to predict CAZymes, CGCs, and their .. code-block:: shell run_dbcan prokka_Wet2014/Wet2014.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --cgc_substrate --hmm_cpu 32 --out_dir Wet2014.dbCAN --dia_cpu 32 - run_dbcan prokka_Dry2014/Dry2014.faa protein --dbcan_thread 32 --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --out_dir Dry2014.dbCAN --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + run_dbcan prokka_Dry2014/Dry2014.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --hmm_cpu 32 --out_dir Dry2014.dbCAN --dia_cpu 32 .. warning:: - The above commands do not set the --tools parameter, + The above commands do not set the `--tools` parameter, which means all three methods for CAZyme annotation will be activated (Box 5). Because dbCAN-sub HMMdb (for CAZyme substrate prediction) is 200 times larger than dbCAN HMMdb, - the runtime will be much longer. Users can specify --tools hmmer, so that the HMMER search against dbCAN-sub will be disabled. + the runtime will be much longer. Users can specify `--tools hmmer`, so that the HMMER search against dbCAN-sub will be disabled. However, this will turn off the substrate prediction for CAZymes and CGCs based on CAZyme substrate majority voting. 
Consequently, the substrate prediction will be solely based on homology search against PULs in dbCAN-PUL @@ -370,70 +483,96 @@ The following commands will re-run run_dbcan to predict CAZymes, CGCs, and their run_dbcan prokka_Wet2014/Wet2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --cgc_substrate --out_dir Wet2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --out_dir Dry2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 -.. warning:: - The above commands do not set the --tools parameter, which means all three methods for CAZyme annotation will be activated (Box 5). - Because dbCAN-sub HMMdb (for CAZyme substrate prediction) is 200 times larger than dbCAN HMMdb, the runtime will be much longer. - Users can specify --tools hmmer, so that the HMMER search against dbCAN-sub will be disabled. - However, this will turn off the substrate prediction for CAZymes and CGCs based on CAZyme substrate majority voting. - Consequently, the substrate prediction will be solely based on homology search against PULs in dbCAN-PUL (Figure 1, Table 1). - - .. code-block:: shell - - run_dbcan prokka_Wet2014/Wet2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Wet2014/Wet2014.gff --cgc_substrate --out_dir Wet2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 - run_dbcan prokka_Dry2014/Dry2014.faa protein --tools hmmer --stp_cpu 32 -c prokka_Dry2014/Dry2014.gff --cgc_substrate --out_dir Dry2014.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 - - -Box 6. Example Output Folder Content of run_dbcan Substrate Prediction +Box 6. 
Example output folder content of run_dbcan substrate prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The output directory of run_dbcan substrate prediction typically contains 17 files and 1 folder: + In the output directory (`Output Directory `_), + a total of 17 files and 1 folder are generated: + .. code-block:: shell - -rw-rw-r-- 1 jinfang jinfang 33M Dec 17 09:36 blastp.out - -rw-rw-r-- 1 jinfang jinfang 3.3M Dec 17 09:35 CAZyme.pep + -rw-rw-r-- 1 jinfang jinfang 33M Dec 17 09:36 PUL_blast.out + -rw-rw-r-- 1 jinfang jinfang 3.3M Dec 17 09:35 CGC.faa -rw-rw-r-- 1 jinfang jinfang 18M Dec 17 09:35 cgc.gff -rw-rw-r-- 1 jinfang jinfang 836K Dec 17 09:35 cgc.out -rw-rw-r-- 1 jinfang jinfang 374K Dec 17 09:35 cgc_standard.out -rw-rw-r-- 1 jinfang jinfang 1.8M Dec 17 09:35 cgc_standard.out.json - -rw-rw-r-- 1 jinfang jinfang 785K Dec 17 09:31 dbsub.out + -rw-rw-r-- 1 jinfang jinfang 785K Dec 17 09:31 dbcan-sub.hmm.out -rw-rw-r-- 1 jinfang jinfang 511K Dec 17 09:31 diamond.out -rw-rw-r-- 1 jinfang jinfang 638K Dec 17 09:31 dtemp.out -rw-rw-r-- 1 jinfang jinfang 414K Dec 17 09:31 hmmer.out -rw-rw-r-- 1 jinfang jinfang 386K Dec 17 09:35 overview.txt -rw-rw-r-- 1 jinfang jinfang 2.8M Dec 17 09:35 stp.out - -rw-rw-r-- 1 jinfang jinfang 63K Dec 17 09:36 sub.prediction.out - drwxrwxr-x 2 jinfang jinfang 36K Dec 17 09:39 syntenic.svg + -rw-rw-r-- 1 jinfang jinfang 63K Dec 17 09:36 substrate.out + drwxrwxr-x 2 jinfang jinfang 36K Dec 17 09:39 synteny.pdf -rw-rw-r-- 1 jinfang jinfang 799K Dec 17 09:32 tf-1.out -rw-rw-r-- 1 jinfang jinfang 645K Dec 17 09:34 tf-2.out -rw-rw-r-- 1 jinfang jinfang 2.3M Dec 17 09:35 tp.out -rw-rw-r-- 1 jinfang jinfang 75M Dec 17 02:07 uniInput - Descriptions of Key Output Files: - - `blastp.out`: BLAST results between CGCs and PULs. - - `CAZyme.pep`: Fasta sequences of CAZymes. - - `cgc.gff`: Reformatted user input GFF file, marking CAZymes, TFs, TCs, and STPs. - - `cgc.out`: Raw output of CGC predictions. 
- - `cgc_standard.out`: Simplified version of `cgc.out` in TSV format for easy parsing (refer to Box 7 for columns). - - `cgc_standard.out.json`: JSON format of `cgc_standard.out`. - - `dbsub.out`: HMMER search result against dbCAN-sub HMMdb, with CAZyme substrates extracted from fam-substrate-mapping-08252022.tsv. - - `diamond.out`: DIAMOND search result against the CAZy annotated protein sequences (CAZyDB.08062022.fa). - - `dtemp.out`: Temporary file. - - `hmmer.out`: HMMER search result against dbCAN HMMdb. - - `overview.txt`: Summary of CAZyme annotation from three methods in TSV format (refer to Box 7 for columns). - - `stp.out`: HMMER search result against the MiST65 compiled signal transduction protein HMMs from Pfam. - - `tf-1.out` and `tf-2.out`: HMMER search results against transcription factor HMMs from Pfam and Superfamily databases. - - `tp.out`: DIAMOND search result against the TCDB annotated protein sequences. - - `sub.prediction.out`: Summary of substrate prediction results (refer to Box 7) for CGCs. - - `syntenic.svg`: Syntenic block alignment plots between all CGCs and PULs. - - `uniInput`: Renamed Fasta file from input protein sequence file. + Descriptions of Output Files: + In the output directory, a total of 17 files and 1 folder are generated: + + - ``PUL_blast.out``: BLAST results between CGCs and PULs. + - ``CGC.faa``: Protein Fasta sequences encoded in all CGCs. + - ``cgc.gff``: Reformatted from the user input gff file by marking CAZymes, TFs, TCs, and STPs. + - ``cgc.out``: Raw output of CGC predictions. + - ``cgc_standard.out``: Simplified version of cgc.out for easy parsing in TSV format. Example columns include: + + 1. ``CGC_id``: CGC1 + 2. ``type``: CAZyme + 3. ``contig_id``: k141_272079 + 4. ``gene_id``: Wet2014_00308 + 5. ``start``: 5827 + 6. ``end``: 7257 + 7. ``strand``: - + 8. ``annotation``: GH1 + + **Explanation**: The gene Wet2014_00308 encodes a GH1 CAZyme in CGC1 of contig k141_272079. 
CGC1 also contains other genes, detailed in other rows. Wet2014_00308 is located on the negative strand of k141_272079 from positions 5827 to 7257. The type can be one of four signature gene types (CAZymes, TCs, TFs, STPs) or null type (not annotated as one of the signature genes). + + - ``cgc_standard.out.json``: JSON format of cgc_standard.out. + - ``dbcan-sub.hmm.out``: HMMER search result against dbCAN-sub HMMdb, including a column with CAZyme substrates from fam-substrate-mapping-08012023.tsv. + - ``diamond.out``: DIAMOND search result against the CAZy annotated protein sequences (CAZyDB.07262023.fa). + - ``dtemp.out``: Temporary file. + - ``hmmer.out``: HMMER search result against dbCAN HMMdb. + - ``overview.txt``: Summary of CAZyme annotation from three methods in TSV format. Example columns include: + + 1. ``Gene_ID``: Wet2014_00076 + 2. ``EC#``: 3.2.1.99:3 + 3. ``dbCAN``: GH43_4(42-453) + 4. ``dbCAN_sub``: GH43_e149 + 5. ``DIAMOND``: GH43_4 + 6. ``#ofTools``: 3 + + **Explanation**: The protein Wet2014_000076 is annotated by three tools as a CAZyme: GH43_4 (CAZy defined subfamily 4 of GH43) by HMMER vs dbCAN HMMdb, GH43_e149 (eCAMI defined subfamily e149; 'e' indicates it is from eCAMI not CAZy) by HMMER vs dbCAN-sub HMMdb, and GH43_4 by DIAMOND vs CAZy annotated protein sequences. The EC number is extracted from eCAMI, indicating that the eCAMI subfamily GH43_e149 contains 3 member proteins with an EC 3.2.1.99 according to CAZy. The preference order for different assignments is dbCAN > eCAMI/dbCAN-sub > DIAMOND. Refer to dbCAN2 paper11, dbCAN3 paper12, and eCAMI41 for more details. + + **Note**: If the ``--use_signalP`` parameter was invoked when running run_dbcan, an additional column called ``signalP`` will be in overview.txt. + + - ``stp.out``: HMMER search result against the MiST70 compiled signal transduction protein HMMs from Pfam. + - ``tf-1.out``: HMMER search result against the DBD71 compiled transcription factor HMMs from Pfam72. 
+ - ``tf-2.out``: HMMER search result against the DBD compiled transcription factor HMMs from Superfamily73. + - ``tp.out``: DIAMOND search result against the TCDB 74 annotated protein sequences. + - ``substrate.out``: Summary of substrate prediction results for CGCs in TSV format from two approaches12 (dbCAN-PUL blast search and dbCAN-sub majority voting). Example columns include: + + 1. ``CGC_ID``: k141_227425|CGC1 + 2. ``Best hit PUL_ID in dbCAN-PUL``: PUL0402 + 3. ``Substrate of the hit PUL``: xylan + 4. ``Sum of bitscores for homologous gene pairs between CGC and PUL``: 2134.0 + 5. ``Types of homologous gene pairs``: TC-TC;CAZyme-CAZyme + 6. ``Substrate predicted by majority voting of CAZymes in CGC``: xylan + 7. ``Voting score``: 2.0 + *Explanation*: The CGC1 of contig k141_227425 has its best hit PUL0402 (from PUL_blast.out) with xylan as substrate (from dbCAN-PUL_12-12-2023.xlsx). Two signature genes are matched between k141_227425|CGC1 and PUL0402: one is a CAZyme and the other is a TC. The sum of blast bit scores of the two homologous pairs (TC-TC and CAZyme-CAZyme) is 2134.0. Hence, the substrate of k141_227425|CGC1 is predicted to be xylan according to dbCAN-PUL blast search. The last two columns are based on the dbCAN-sub result (dbcan-sub.hmm.out), as the file indicates that two CAZymes in k141_227425|CGC1 are predicted to have xylan substrate. The voting score is 2.0, so according to the majority voting rule, k141_227425|CGC1 is predicted to have a xylan substrate. + *Note*: For many CGCs, only one of the two approaches produces substrate prediction. In some cases, the two approaches produce different substrate assignments. The recommended preference order is dbCAN-PUL blast search > dbCAN-sub majority voting. Refer to dbCAN3 paper12 for more details. -Module 3. 
Read mapping (Figure 3) to calculate abundance for CAZyme families, subfamilies, CGCs, and substrates ---------------------------------------------------------------------------------------------------------------- + - ``synteny.pdf``: A folder with syntenic block alignment plots between all CGCs and PULs. + - ``uniInput``: Renamed Fasta file from input protein sequence file. +Module 3. Read mapping (Fig. 2) to calculate abundance for CAZyme families, subfamilies, CGCs, and substrates +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` P8. Read mapping to all CDS of each sample (TIMING ~20 min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -442,10 +581,11 @@ P8. Read mapping to all CDS of each sample (TIMING ~20 min) bwa index prokka_Wet2014/Wet2014.ffn bwa index prokka_Dry2014/Dry2014.ffn mkdir samfiles - bwa mem -t 32 -o samfiles/Wet2014.CDS.sam prokka_Wet2014/Wet2014.ffn Wet2014_1_val_1.fq.gz Wet2014 _2_val_2.fq.gz + bwa mem -t 32 -o samfiles/Wet2014.CDS.sam prokka_Wet2014/Wet2014.ffn Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz bwa mem -t 32 -o samfiles/Dry2014.CDS.sam prokka_Dry2014/Dry2014.ffn Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz -Reads are mapped to the ffn files from Prokka. + +Reads are mapped to the ``ffn`` files from Prokka. P9. Read mapping to all contigs of each sample (TIMING ~20min) @@ -453,147 +593,202 @@ P9. Read mapping to all contigs of each sample (TIMING ~20min) .. 
code-block:: shell - $ bwa index megahit_Wet2014/Wet2014.contigs.fa - $ bwa index megahit_Dry2014/Dry2014.contigs.fa - $ bwa mem -t 32 -o samfiles/Wet2014.sam megahit_Wet2014/Wet2014.contigs.fa Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz - $ bwa mem -t 32 -o samfiles/Dry2014.sam megahit_Dry2014/Dry2014.contigs.fa Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz + bwa index megahit_Wet2014/Wet2014.contigs.fa + bwa index megahit_Dry2014/Dry2014.contigs.fa + bwa mem -t 32 -o samfiles/Wet2014.sam megahit_Wet2014/Wet2014.contigs.fa Wet2014_1_val_1.fq.gz Wet2014_2_val_2.fq.gz + bwa mem -t 32 -o samfiles/Dry2014.sam megahit_Dry2014/Dry2014.contigs.fa Dry2014_1_val_1.fq.gz Dry2014_2_val_2.fq.gz -Reads are mapped to the contig files from MEGAHIT. +Reads are mapped to the `contig` files from MEGAHIT. P10. Sort SAM files by coordinates (TIMING ~8min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell - $ cd samfiles - $ samtools sort -@ 32 -o Wet2014.CDS.bam Wet2014.CDS.sam - $ samtools sort -@ 32 -o Dry2014.CDS.bam Dry2014.CDS.sam - $ samtools sort -@ 32 -o Wet2014.bam Wet2014.sam - $ samtools sort -@ 32 -o Dry2014.bam Dry2014.sam - $ rm -rf *sam - $ cd .. + cd samfiles + samtools sort -@ 32 -o Wet2014.CDS.bam Wet2014.CDS.sam + samtools sort -@ 32 -o Dry2014.CDS.bam Dry2014.CDS.sam + samtools sort -@ 32 -o Wet2014.bam Wet2014.sam + samtools sort -@ 32 -o Dry2014.bam Dry2014.sam + rm -rf *sam + cd .. + P11. Read count calculation for all proteins of each sample using Bedtools (TIMING ~2min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code-block:: shell - $ mkdir Wet2014_abund && cd Wet2014_abund - $ seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"$2}' > Wet2014.length - $ seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"0"\t"$2}' > Wet2014.bed - $ bedtools coverage -g Wet2014.length -sorted -a Wet2014.bed -counts -b ../samfiles/Wet2014.CDS.bam > Wet2014.depth.txt + mkdir Wet2014_abund && cd Wet2014_abund + seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"$2}' > Wet2014.length + seqkit fx2tab -l -n -i ../prokka_Wet2014/Wet2014.ffn | awk '{print $1"\t"0"\t"$2}' > Wet2014.bed + bedtools coverage -g Wet2014.length -sorted -a Wet2014.bed -counts -b ../samfiles/Wet2014.CDS.bam > Wet2014.depth.txt - $ cd .. && mkdir Dry2014_abund && cd Dry2014_abund - $ seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"$2}' > Dry2014.length - $ seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"0"\t"$2}' > Dry2014.bed - $ bedtools coverage -g Dry2014.length -sorted -a Dry2014.bed -counts -b ../samfiles/Dry2014.CDS.bam > Dry2014.depth.txt - $ cd .. + cd .. && mkdir Dry2014_abund && cd Dry2014_abund + seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"$2}' > Dry2014.length + seqkit fx2tab -l -n -i ../prokka_Dry2014/Dry2014.ffn | awk '{print $1"\t"0"\t"$2}' > Dry2014.bed + bedtools coverage -g Dry2014.length -sorted -a Dry2014.bed -counts -b ../samfiles/Dry2014.CDS.bam > Dry2014.depth.txt + cd .. -Read counts are saved in depth.txt files of each sample. + +Read counts are saved in ``depth.txt`` files of each sample. P12. Read count calculation for a given region of contigs using Samtools (TIMING ~2min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell - $ cd Wet2014_abund - $ samtools index ../samfiles/Wet2014.bam - $ samtools depth -r k141_41392:152403-165349 ../samfiles/Wet2014.bam > Wet2014.cgc.depth.txt - $ cd .. 
- $ cd Dry2014_abund - $ samtools index ../samfiles/Dry2014.bam - $ samtools depth -r k141_41392:152403-165349 ../samfiles/Dry2014.bam > Dry2014.cgc.depth.txt + cd Wet2014_abund + samtools index ../samfiles/Wet2014.bam + samtools depth -r k141_41392:152403-165349 ../samfiles/Wet2014.bam > Wet2014.cgc.depth.txt + cd .. + -The parameter -r k141_41392:152403-165349 specifies a region in a contig. For any CGC, its positional range can be found in the file cgc_standard.out produced by run_dbcan (Box 6). The depth.txt files contain the raw read counts for the specified region. +The parameter ``-r k141_41392:152403-165349`` specifies a region in a contig. For any CGC, its positional range can be found in the file ``cgc_standard.out`` produced by ``run_dbcan`` (refer to Box 6). The ``depth.txt`` files contain the raw read counts for the specified region. + +.. warning:: + The contig IDs are automatically generated by MEGAHIT. There is a small chance that the same contig ID appears in both samples. However, the two contigs in the two samples do not match each other even if the ID is the same. For example, the contig ID ``k141_4139`` is most likely only found in the Wet2014 sample. Even if there is a ``k141_41392`` in Dry2014, the actual contigs in the two samples are different. P13. dbcan_utils to calculate the abundance of CAZyme families, subfamilies, CGCs, and substrates (TIMING ~1min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code-block:: shell - $ dbcan_utils CAZyme_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM - $ dbcan_utils CAZymeSub_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM - $ dbcan_utils PUL_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM - $ dbcan_utils PULSub_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM + dbcan_utils fam_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM + dbcan_utils fam_substrate_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM + dbcan_utils CGC_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM + dbcan_utils CGC_substrate_abund -bt Wet2014.depth.txt -i ../Wet2014.dbCAN -a TPM - $ cd .. && cd Dry2014_abund - $ dbcan_utils CAZyme_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM - $ dbcan_utils CAZymeSub_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM - $ dbcan_utils PUL_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM - $ dbcan_utils PULSub_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM + cd .. && cd Dry2014_abund + dbcan_utils fam_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM + dbcan_utils fam_substrate_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM + dbcan_utils CGC_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM + dbcan_utils CGC_substrate_abund -bt Dry2014.depth.txt -i ../Dry2014.dbCAN -a TPM cd .. -We developed a set of Python scripts as dbcan_utils to take the raw read counts for all CDS as input and output the normalized abundances (Box 8) of CAZyme families, subfamilies, CGCs, and substrates (Figure 4). The parameter -a TPM can also be two other metrics: RPM, or FPKM. -Box 8. Example output of dbcan_utils -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We developed a set of Python scripts as ``dbcan_utils`` (included in the ``run_dbcan`` package) to take the raw read counts for all genes as input and output the normalized abundances (refer to Box 7) of CAZyme families, subfamilies, CGCs, and substrates (see Fig. 4). 
The parameter ``-a TPM`` can also be set to two other metrics: RPM, or RPKM61. - Executing these commands will yield five distinct files for each sample: CAZyme_abund_output, - PUL_abund_output, CAZymeSub_abund_output, PULSub_abund_output.major_voting, and PULSub_abund_output.homo. - These files encompass the abundance of CAZyme, CGC, and substrates within the respective samples, - providing detailed information. Notably, users can conveniently trace back the abundance of these entities. - The abundance calculations adhere to the TPM definition. +- **RPKM** is calculated as the number of mapped reads to a gene G divided by [(total number of mapped reads to all genes / 10^6) x (gene G length / 1000)]. +- **RPM** is the number of mapped reads to a gene G divided by (total number of mapped reads to all genes / 10^6). +- **TPM** is calculated as [number of mapped reads to a gene G / (gene G length / 1000)] divided by the sum of [number of mapped reads to each gene / (the gene length / 1000)]. -Module 4: dbcan_plot for data visualization (Figure 3) of abundances of CAZymes, CGCs, and substrates (TIMING variable) ------------------------------------------------------------------------------------------------------------------------ -To visualize the CAZyme annotation result, we provide a set of Python scripts as dbcan_plot to make publication quality plots with the dbcan_utils results as the input. The dbcan_plot scripts can be installed with commands as follows: +Box 7. Example output of dbcan_utils +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +As an example, the Wet2014_abund folder (https://bcb.unl.edu/dbCAN_tutorial/dataset1-Carter2023/individual_assembly/Wet2014_abund/) has 7 TSV files: .. 
code-block:: shell - $ python3 setup.py install + -rw-rw-r-- 1 jinfang jinfang 201106 Dec 31 01:58 CGC_abund.out + -rw-rw-r-- 1 jinfang jinfang 2204 Dec 31 01:58 CGC_substrate_majority_voting.out + -rw-rw-r-- 1 jinfang jinfang 16282 Dec 31 01:58 CGC_substrate_PUL_homology.out + -rw-rw-r-- 1 jinfang jinfang 2695 Dec 31 01:58 EC_abund.out + -rw-rw-r-- 1 jinfang jinfang 3949 Dec 31 01:58 fam_abund.out + -rw-rw-r-- 1 jinfang jinfang 44138 Dec 31 01:58 fam_substrate_abund.out + -rw-rw-r-- 1 jinfang jinfang 27314 Dec 31 01:58 subfam_abund.out + -rw-rw-r-- 1 jinfang jinfang 270535 Dec 31 02:43 Wet2014.cgc.depth.txt + +Explanation of columns in these TSV files is as follows: + + - ``fam_abund.out``: CAZy family (from HMMER vs dbCAN HMMdb), sum of TPM, number of CAZymes in the family. + - ``subfam_abund.out``: eCAMI subfamily (from HMMER vs dbCAN-sub HMMdb), sum of TPM, number of CAZymes in the subfamily. + - ``EC_abund.out``: EC number (extracted from dbCAN-sub subfamily), sum of TPM, number of CAZymes with the EC. + - ``fam_substrate_abund.out``: Substrate (from HMMER vs dbCAN-sub HMMdb), sum of TPM (all CAZymes in this substrate group), GeneID (all CAZyme IDs in this substrate group). + - ``CGC_abund.out``: CGC_ID (e.g., k141_338400|CGC1), mean of TPM (all genes in the CGC), Seq_IDs (IDs of all genes in the CGC), TPM (of all genes in the CGC), Families (CAZyme family or other signature gene type of all genes in the CGC). + - ``CGC_substrate_PUL_homology.out``: Substrate (from dbCAN-PUL blast search), sum of TPM, CGC_IDs (all CGCs predicted to have the substrate from dbCAN-PUL blast search), TPM (of CGCs in this substrate group). + - ``CGC_substrate_majority_voting.out``: Substrate (from dbCAN-sub majority voting), sum of TPM, CGC_IDs (all CGCs predicted to have the substrate from dbCAN-sub majority voting), TPM (of CGCs in this substrate group). + +.. warning:: + As shown in Fig. 
2 (step3), proteins from multiple samples can be combined to generate a non-redundant set of proteins (Box 8). This may reduce the runtime for the run_dbcan step (step4), as only one faa file will be processed. However, this does not work for the CGC prediction, as contigs (fna files) from each sample will be needed. Therefore, this step is recommended if users only want the CAZyme annotation, and not recommended if CGCs are also to be predicted. + +Module 4: dbcan_plot for data visualization (Fig. 2) of abundances of CAZymes, CGCs, and substrates (TIMING variable) +````````````````````````````````````````````````````````````````````````````````````````````````````````````````````` +**CRITICAL STEP** -In addition to the two abundance folders Wet2014_abund and Dry2014_abund, the two CAZyme annotation folders Wet2014.dbCAN and Dry2014.dbCAN, are also needed well as two abundance folders Wet2014_abund. +To visualize the CAZyme annotation result, we provide a set of Python scripts as ``dbcan_plot`` to make publication-quality plots with the ``dbcan_utils`` results as the input. The ``dbcan_plot`` scripts are included in the ``run_dbcan`` package. Once the plots are made in PDF format, they can be transferred to users' Windows or Mac computers for visualization. -P14. Heatmap for CAZyme substrate abundance across samples (Figure 6A) (TIMING ~xx) +Five data folders will be needed as the input for ``dbcan_plot``: + +1. Two abundance folders: ``Wet2014_abund`` and ``Dry2014_abund``. +2. Two CAZyme annotation folders: ``Wet2014.dbCAN`` and ``Dry2014.dbCAN``. +3. The ``dbCAN-PUL`` folder (located under the db folder, released from ``dbCAN-PUL.tar.gz``). + + + +P14. Heatmap for CAZyme substrate abundance across samples (Fig. 6A) (TIMING 1min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code-block:: shell - $ dbcan_plot heatmap_plot --samples Wet2014,Dry2014 -i Wet2014_abund/CAZymeSub_abund_output,Dry2014_abund/CAZymeSub_abund_output --show_abund --top 20 + dbcan_plot heatmap_plot --samples Wet2014,Dry2014 -i Wet2014_abund/fam_substrate_abund.out,Dry2014_abund/fam_substrate_abund.out --show_abund --top 20 -Here we plot the top 20 substrates in the two samples. -The input files are the two CAZyme substrate abundance files calculated based on dbCAN-sub result. -The default heatmap is ranked by substrate abundances. -To rank the heatmap according to abundance profile using the xxx clustering algorithm, -users can invoke the `--cluster_map` parameter. +Here we plot the top 20 substrates in the two samples (Fig. 6A). The input files are the two CAZyme substrate abundance files calculated based on +dbCAN-sub result. The default heatmap is ranked by substrate abundances. To rank the heatmap according to abundance profile using +the clustermap function of the seaborn package (https://github.com/mwaskom/seaborn), users can invoke the ``--cluster_map`` parameter. -P15. Barplot for CAZyme substrate abundance across samples (Figure 6B) (TIMING ~xx) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +P15. Barplot for CAZyme family/subfamily/EC abundance across samples (Fig. 6B,C) (TIMING 1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell - $ dbcan_plot bar_plot --samples Wet2014,Dry2014 --vertical_bar --top 20 -i Wet2014_abund/CAZyme_abund_output,Dry2014_abund/CAZyme_abund_output + dbcan_plot bar_plot --samples Wet2014,Dry2014 --vertical_bar --top 20 -i Wet2014_abund/fam_abund.out,Dry2014_abund/fam_abund.out + dbcan_plot bar_plot --samples Wet2014,Dry2014 --vertical_bar --top 20 -i Wet2014_abund/subfam_abund.out,Dry2014_abund/subfam_abund.out -Users can choose to generate a barplot instead of heatmap using the bar_plot method. 
+Users can choose to generate a barplot instead of heatmap using the bar_plot method. -P16. Synteny plot between a CGC and its best PUL hit with read mapping coverage to CGC (Figure 6C) (TIMING ~xx) +P16. Synteny plot between a CGC and its best PUL hit with read mapping coverage to CGC (Fig. 6D) (TIMING 1min) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: shell - $ dbcan_plot CGC_syntenic_with_PUL_abund -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt + dbcan_plot CGC_synteny_coverage_plot -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt -The Wet2014.dbCAN folder contains the PUL.out file. Using this file, the cgc_standard.out file, and the best PUL’s gff file in dbCAN-PUL.tar.gz, the CGC_synteny_plot method will create the CGC-PUL synteny plot. The –cgcid parameter is required to specify which CGC to be plotted (‘k141_41392|CGC3' in this example). The Wet2014.cgc.depth.txt file is used to plot the read mapping coverage. +The Wet2014.dbCAN folder contains the PUL.out file. Using this file, the cgc_standard.out file, and the best PUL's gff file in dbCAN-PUL.tar.gz, the CGC_synteny_plot method will create the CGC-PUL synteny plot. The ``--cgcid`` parameter is required to specify which CGC to plot (k141_41392|CGC3 in this example). The Wet2014.cgc.depth.txt file is used to plot the read mapping coverage. If users only want to plot the CGC structure: .. code-block:: shell - $ dbcan_plot CGC -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' + dbcan_plot CGC_plot -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' If users only want to plot the CGC structure plus the read mapping coverage: .. 
code-block:: shell - $ dbcan_plot CGC_abund -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt + dbcan_plot CGC_coverage_plot -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' --readscount Wet2014_abund/Wet2014.cgc.depth.txt If users only want to plot the synteny between the CGC and PUL: .. code-block:: shell - $ dbcan_plot CGC_syntenic_with_PUL -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' + dbcan_plot CGC_synteny_plot -i Wet2014.dbCAN --cgcid 'k141_41392|CGC3' .. warning:: - The CGC IDs in different samples do not match each other. For example, specifying -i Wet2014.dbCAN is to plot the `k141_41392|CGC3`` in the Wet2014 sample. The `k141_41392|CGC3`` in the Dry2014 sample will be different. + The CGC IDs in different samples do not match each other. For example, specifying -i Wet2014.dbCAN is to plot the 'k141_41392|CGC3' in the Wet2014 sample. The 'k141_41392|CGC3' in the Dry2014 sample most likely does not exist, and even if it does, the CGC has a different sequence even if the ID is the same. + + +Troubleshooting +--------------- + +We provide Table 3 to list possible issues and solutions. Users can also post issues on the run_dbcan GitHub site. + +Procedure Timing +---------------- + +The estimated time for completing each step of the protocol on the Carter2023 dataset is as follows: + +- **Step P1. Contamination checking**: Approximately 10 minutes. +- **Step P2. Raw reads processing**: Approximately 20 minutes. +- **Step P3. Metagenome assembly**: Approximately 4 hours and 20 minutes. +- **Step P4. Gene models prediction**: Approximately 21 hours. +- **Step P5. CAZyme annotation**: Approximately 10 minutes. +- **Step P6. PUL prediction**: Approximately 15 minutes. +- **Step P7. Substrate prediction for CAZyme and PUL**: Approximately 5 hours. +- **Step P8-P12. Reads mapping**: Approximately 52 minutes. +- **Step P13. Abundance estimation**: Approximately 1 minute. +- **Step P14-P16. Data visualization**: Approximately 3 minutes. 
+ +Running this protocol on a Linux computer with 40 CPUs and 128GB of RAM is estimated to take approximately 33 hours. The most time-consuming step is P4 (Prokka gene prediction), which can be replaced by Prodigal59 for protein prediction to save time. The second most time-consuming step is P7 (substrate prediction for CGCs and CAZymes). Omitting substrate prediction reduces this step's time to about 15 minutes. The highest RAM usage is likely during P3 (read assembly), although specific RAM usage was not monitored during the protocol execution. diff --git a/docs/user_guide/run_from_raw_reads_ex.rst b/docs/user_guide/run_from_raw_reads_ex.rst new file mode 100644 index 000000000..bec216f1b --- /dev/null +++ b/docs/user_guide/run_from_raw_reads_ex.rst @@ -0,0 +1,546 @@ +More examples: Run from Raw Reads +================================= + +.. _wastyk_2021: + +Example 2: Wastyk2021 Dataset :cite:`2021:Wastyk` +------------------------------------------------- + +The Wastyk2021 dataset :cite:`2021:Wastyk` was published in 20211 from a human dietary intervention study. In the published paper, researchers studied how high-fermented and high-fiber diets influence the human microbiome metabolism and modulate the human immune status. Among various data analyses conducted in the paper1, CAZymes were mined from shotgun metagenomic reads of 18 healthy human participants, and each participant had four time points of stool samples for metagenome sequencing. CAZyme abundance profiles were compared before and after the high-fiber intervention (baseline vs high-fiber). One of the main findings from their CAZyme analysis was that high-fiber consumption increased the CAZyme abundance. For this protocol, we will select two samples (paired-end 2x146bp reads) of two time points (day 2 before high-fiber diet as baseline, and 10 weeks after high-fiber diet as intervention) from one participant (Table S2). The protocol is for the individual sample route (Fig. 3). 
+ +The raw read data, intermediate data from each analysis step, and final result data and visualization files are organized in nested folders available on our website https://bcb.unl.edu/dbCAN_tutorial/dataset2-Wastyk2021/, Fig. 5) and https://dbcan.readthedocs.io. + +Procedure +--------- + +Module 1: Reads processing (Fig. 3) to obtain contigs +````````````````````````````````````````````````````` + +P1. Contamination Check (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Download the Wastyk2021 dataset: + +.. code-block:: shell + + wget https://bcb.unl.edu/dbCAN_tutorial/dataset2-Wastyk2021/fefifo_8022_1__shotgun_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset2-Wastyk2021/fefifo_8022_1__shotgun_2.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset2-Wastyk2021/fefifo_8022_7__shotgun_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset2-Wastyk2021/fefifo_8022_7__shotgun_2.fastq.gz + +Use `kraken2` to check for contaminated reads: + +.. code-block:: shell + + kraken2 --threads 32 --quick --paired --db K2 --report fefifo_8022_1.kreport --output fefifo_8022_1.kraken.output fefifo_8022_1__shotgun_1.fastq.gz fefifo_8022_1__shotgun_2.fastq.gz + kraken2 --threads 32 --quick --paired --db K2 --report fefifo_8022_7.kreport --output fefifo_8022_7.kraken.output fefifo_8022_7__shotgun_1.fastq.gz fefifo_8022_7__shotgun_2.fastq.gz + +Kraken2 found very little contamination in the data. Consequently, there was no need for the contamination removal step. + +If contamination is identified, users can align the reads to the reference genomes of potential contamination source organisms to remove +the aligned reads (Box 1). The most common source in human microbiome studies is from human hosts. + +Box 1: Example to remove contamination reads from human +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Kraken2 will produce the following output files: + + .. 
code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 1.1G Sep 21 23:17 fefifo_8022_1.kraken.output + -rw-rw-r-- 1 jinfang jinfang 991K Sep 21 23:19 fefifo_8022_1.kreport + -rw-rw-r-- 1 jinfang jinfang 574M Sep 21 23:21 fefifo_8022_7.kraken.output + -rw-rw-r-- 1 jinfang jinfang 949K Sep 21 23:22 fefifo_8022_7.kreport + + + Suppose from these files, we have identified humans as the contamination source, we can use the following commands to remove the contamination reads by aligning reads to the human reference genome. + + .. code-block:: shell + + wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + bwa index -p hg38 Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + bwa mem hg38 fefifo_8022_1__shotgun_1.fastq.gz fefifo_8022_1__shotgun_2.fastq.gz -t 32 -o fefifo_8022_1.hg38.sam + bwa mem hg38 fefifo_8022_7__shotgun_1.fastq.gz fefifo_8022_7__shotgun_2.fastq.gz -t 32 -o fefifo_8022_7.hg38.sam + samtools view -f 12 fefifo_8022_1.hg38.sam > fefifo_8022_1.hg38.unmap.bam + samtools view -f 12 fefifo_8022_7.hg38.sam > fefifo_8022_7.hg38.unmap.bam + samtools fastq -1 fefifo_8022_1_1.clean.fq.gz -2 fefifo_8022_1_2.clean.fq.gz fefifo_8022_1.hg38.unmap.bam + samtools fastq -1 fefifo_8022_7_1.clean.fq.gz -2 fefifo_8022_7_2.clean.fq.gz fefifo_8022_7.hg38.unmap.bam + +P2. Trim adapter and low-quality reads (TIMING ~20min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + trim_galore --paired fefifo_8022_1__shotgun_1.fastq.gz fefifo_8022_1__shotgun_2.fastq.gz --illumina -j 36 + trim_galore --paired fefifo_8022_7__shotgun_1.fastq.gz fefifo_8022_7__shotgun_2.fastq.gz --illumina -j 36 + +We specified --illumina to indicate that the reads were generated using the Illumina sequencing platform. +Nonetheless, trim_galore can automatically detect adapters, providing flexibility for users who may know the specific sequencing platform. 
+Details of trimming are available in the trimming report file (Box 2). + +Box 2: Example output of `trim_galore` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + In addition to the trimmed read files, ``Trim_galore`` also generates a trimming report file. + The trimming report contains details on read trimming, such as the number of trimmed reads. + + .. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 429M Oct 30 22:44 fefifo_8022_1__shotgun_1.fastq.gz + -rw-rw-r-- 1 jinfang jinfang 4.1K Oct 31 05:15 fefifo_8022_1__shotgun_1.fastq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 390M Oct 31 05:16 fefifo_8022_1__shotgun_1_val_1.fq.gz + -rw-rw-r-- 1 jinfang jinfang 540M Oct 30 22:44 fefifo_8022_1__shotgun_2.fastq.gz + -rw-rw-r-- 1 jinfang jinfang 4.2K Oct 31 05:16 fefifo_8022_1__shotgun_2.fastq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 499M Oct 31 05:16 fefifo_8022_1__shotgun_2_val_2.fq.gz + -rw-rw-r-- 1 jinfang jinfang 931M Oct 30 22:34 fefifo_8022_7__shotgun_1.fastq.gz + -rw-rw-r-- 1 jinfang jinfang 4.2K Oct 31 05:17 fefifo_8022_7__shotgun_1.fastq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 861M Oct 31 05:20 fefifo_8022_7__shotgun_1_val_1.fq.gz + -rw-rw-r-- 1 jinfang jinfang 1.1G Oct 30 22:34 fefifo_8022_7__shotgun_2.fastq.gz + -rw-rw-r-- 1 jinfang jinfang 4.4K Oct 31 05:20 fefifo_8022_7__shotgun_2.fastq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 1003M Oct 31 05:20 fefifo_8022_7__shotgun_2_val_2.fq.gz + +.. warning:: + + During the trimming process, certain reads may be removed entirely due to low quality. + Using the ``--retain_unpaired`` parameter in ``trim_galore`` allows for the preservation of single-end reads. + In this protocol, this option was not selected, so that both reads of a forward-reverse pair were removed. + +P3. Assemble reads into contigs (TIMING ~84min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use Megahit for assembling reads into contigs: + +.. 
code-block:: shell + + megahit -m 0.5 -t 32 -o megahit_fefifo_8022_1 -1 fefifo_8022_1__shotgun_1_val_1.fq.gz -2 fefifo_8022_1__shotgun_2_val_2.fq.gz --out-prefix fefifo_8022_1 --min-contig-len 1000 + megahit -m 0.5 -t 32 -o megahit_fefifo_8022_7 -1 fefifo_8022_7__shotgun_1_val_1.fq.gz -2 fefifo_8022_7__shotgun_2_val_2.fq.gz --out-prefix fefifo_8022_7 --min-contig-len 1000 + + +`MEGAHIT` generates two output folders `megahit_fefifo_8022_1` and `megahit_fefifo_8022_7`. +Each contains five files and one sub-folder (Box 3). `fefifo_8022_1.contigs.fa` is the final contig sequence file. +We set `--min-contig-len 1000`, a common practice to retain all contigs longer than 1,000 base pairs. + +Box 3: Example output of `MEGAHIT` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 262 Oct 31 05:49 checkpoints.txt + -rw-rw-r-- 1 jinfang jinfang 0 Oct 31 05:49 done + -rw-rw-r-- 1 jinfang jinfang 97M Oct 31 05:49 fefifo_8022_1.contigs.fa + -rw-rw-r-- 1 jinfang jinfang 149K Oct 31 05:49 fefifo_8022_1.log + drwxrwxr-x 2 jinfang jinfang 4.0K Oct 31 05:49 intermediate_contigs + -rw-rw-r-- 1 jinfang jinfang 1.1K Oct 31 05:27 options.json + + +P4. Predict genes by `Prokka` (TIMING ~40h) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + prokka --kingdom Bacteria --cpus 36 --outdir prokka_fefifo_8022_1 --prefix fefifo_8022_1 --addgenes --addmrna --locustag fefifo_8022_1 megahit_fefifo_8022_1/fefifo_8022_1.contigs.fa + prokka --kingdom Bacteria --cpus 36 --outdir prokka_fefifo_8022_7 --prefix fefifo_8022_7 --addgenes --addmrna --locustag fefifo_8022_7 megahit_fefifo_8022_7/fefifo_8022_7.contigs.fa + +The parameter `--kingdom Bacteria` is required for bacterial gene prediction. +To optimize performance, `--cpus 36` instructs the utilization of 36 computer processors. The output files consist of both protein and CDS sequences in Fasta format (e.g., `fefifo_8022_1.faa` and `fefifo_8022_1.ffn` in Box 4). 
+ +Box 4: Example output of `Prokka` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 181 Oct 31 22:28 errorsummary.val + -rw-rw-r-- 1 jinfang jinfang 2.6M Oct 31 22:28 fefifo_8022_1.err + -rw-rw-r-- 1 jinfang jinfang 31M Oct 31 22:09 fefifo_8022_1.faa + -rw-rw-r-- 1 jinfang jinfang 83M Oct 31 22:09 fefifo_8022_1.ffn + -rw-rw-r-- 1 jinfang jinfang 22K Oct 31 22:21 fefifo_8022_1.fixedproducts + -rw-rw-r-- 1 jinfang jinfang 98M Oct 31 21:50 fefifo_8022_1.fna + -rw-rw-r-- 1 jinfang jinfang 99M Oct 31 22:09 fefifo_8022_1.fsa + -rw-rw-r-- 1 jinfang jinfang 222M Oct 31 22:24 fefifo_8022_1.gbf + -rw-rw-r-- 1 jinfang jinfang 142M Oct 31 22:09 fefifo_8022_1.gff + -rw-rw-r-- 1 jinfang jinfang 692K Oct 31 22:29 fefifo_8022_1.log + -rw-rw-r-- 1 jinfang jinfang 406M Oct 31 22:22 fefifo_8022_1.sqn + -rw-rw-r-- 1 jinfang jinfang 26M Oct 31 22:09 fefifo_8022_1.tbl + -rw-rw-r-- 1 jinfang jinfang 12M Oct 31 22:09 fefifo_8022_1.tsv + -rw-rw-r-- 1 jinfang jinfang 131 Oct 31 22:09 fefifo_8022_1.txt + -rw-rw-r-- 1 jinfang jinfang 145K Oct 31 22:24 fefifo_8022_1.val + +Module 2. run_dbcan annotation (Fig. 3) to obtain CAZymes, CGCs, and substrates +``````````````````````````````````````````````````````````````````````````````` + +**CRITICAL STEP** + +Users can skip P5 and P6, and directly run P7 (much slower though), if they want to predict not only CAZymes and CGCs, but also substrates. + +P5. CAZyme annotation at the CAZyme family level (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.faa protein --hmm_cpu 32 --out_dir fefifo_8022_1.CAZyme --tools hmmer --db_dir db + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.faa protein --hmm_cpu 32 --out_dir fefifo_8022_7.CAZyme --tools hmmer --db_dir db + +Two arguments are required for ``run_dbcan``: the input sequence file (faa files) and the sequence type (protein). 
+By default, ``run_dbcan`` will use three methods (``HMMER`` vs ``dbCAN HMMdb``, ``DIAMOND`` vs ``CAZy``, ``HMMER`` vs ``dbCAN-sub HMMdb``) for +CAZyme annotation (Table 1, Fig. 2). This default setting is equivalent to using the ``--tools all`` parameter (Box 5). +Here we only invoke the ``HMMER`` vs ``dbCAN HMMdb`` for CAZyme annotation at the family level. + +Box 5: CAZyme annotation with default setting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the ``--tools`` parameter is not set, it defaults to the equivalent of ``--tools all``. +This setting will take a much longer time to finish (~5 hours) due to the large size of ``dbCAN-sub HMMdb`` +(used for substrate prediction for CAZymes, see Table 1). + +.. code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.faa protein --out_dir fefifo_8022_1.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 [--tools all] + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.faa protein --out_dir fefifo_8022_7.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 [--tools all] + + +The sequence type can be `protein`, `prok`, `meta`. If the input sequence file contains metagenomic contig sequences (`fna` file), +the sequence type has to be `meta`, and `prodigal` will be called to predict genes. + +.. code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.fna meta --out_dir fefifo_8022_1.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.fna meta --out_dir fefifo_8022_7.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 + + +P6. CGC prediction (TIMING ~15 min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following commands will re-run run_dbcan to not only predict CAZymes but also CGCs with protein `faa` and gene location `gff` files. + +.. 
code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_fefifo_8022_1/fefifo_8022_1.gff --out_dir fefifo_8022_1.PUL --dia_cpu 32 --hmm_cpu 32 + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_fefifo_8022_7/fefifo_8022_7.gff --out_dir fefifo_8022_7.PUL --dia_cpu 32 --hmm_cpu 32 + +As mentioned above (see Table 1, Fig. 2), CGC prediction is a featured function added into dbCAN2 in 2018. +To identify CGCs with the protein sequence type, a gene location file (``gff``) must be provided together. If the input sequence type +is ``prok`` or ``meta``, meaning users only have contig ``fna`` files, the CGC prediction can be activated by setting the ``-c cluster`` parameter. + +.. warning:: + + **Creating own gff file** + If the users would like to create their own ``gff`` file (instead of using Prokka or Prodigal), + it is important to make sure the value of ID attribute in the ``gff`` file matches the protein ID in the protein ``faa`` file. + + **[Troubleshooting]CGC not found** + If no result is found in CGC output file, it is most likely because the sequence IDs in ``gff`` file and ``faa`` file do not match. + Another less likely reason is that the contigs are too short and fragmented and not suitable for CGC prediction. + + +P7. Substrate prediction for CAZymes and CGCs (TIMING ~5h) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following commands will re-run run_dbcan to predict CAZymes, CGCs, and their substrates with the `--cgc_substrate` parameter. + +.. 
code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_fefifo_8022_1/fefifo_8022_1.gff --cgc_substrate --hmm_cpu 32 --out_dir fefifo_8022_1.dbCAN --dia_cpu 32 + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_fefifo_8022_7/fefifo_8022_7.gff --cgc_substrate --hmm_cpu 32 --out_dir fefifo_8022_7.dbCAN --dia_cpu 32 + +The above commands do not set the `--tools` parameter, which means all three methods for CAZyme annotation will be activated (Box 5). Because dbCAN-sub HMMdb (for CAZyme substrate prediction) is 200 times larger than dbCAN HMMdb, the runtime will be much longer. Users can specify `--tools hmmer`, so that the HMMER search against dbCAN-sub will be disabled. However, this will turn off the substrate prediction for CAZymes and CGCs based on CAZyme substrate majority voting. Consequently, the substrate prediction will be solely based on homology search against PULs in dbCAN-PUL (Fig. 1, Table 1). + +.. code-block:: shell + + run_dbcan prokka_fefifo_8022_1/fefifo_8022_1.faa protein --tools hmmer --stp_cpu 32 -c prokka_fefifo_8022_1/fefifo_8022_1.gff --cgc_substrate --out_dir fefifo_8022_1.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + run_dbcan prokka_fefifo_8022_7/fefifo_8022_7.faa protein --tools hmmer --stp_cpu 32 -c prokka_fefifo_8022_7/fefifo_8022_7.gff --cgc_substrate --out_dir fefifo_8022_7.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + + +Box 6. Example output folder content of run_dbcan substrate prediction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + In the `fefifo_8022_1.dbCAN _` directory, a total of 17 files and 1 folder are generated: + + .. 
code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 39M Nov 1 22:18 PUL_blast.out + -rw-rw-r-- 1 jinfang jinfang 3.1M Nov 1 22:15 CGC.faa + -rw-rw-r-- 1 jinfang jinfang 6.9M Nov 1 22:15 cgc.gff + -rw-rw-r-- 1 jinfang jinfang 702K Nov 1 22:15 cgc.out + -rw-rw-r-- 1 jinfang jinfang 321K Nov 1 22:15 cgc_standard.out + -rw-rw-r-- 1 jinfang jinfang 1.5M Nov 1 22:15 cgc_standard.out.json + -rw-rw-r-- 1 jinfang jinfang 556K Nov 1 22:14 dbcan-sub.hmm.out + -rw-rw-r-- 1 jinfang jinfang 345K Nov 1 22:14 diamond.out + -rw-rw-r-- 1 jinfang jinfang 455K Nov 1 22:14 dtemp.out + -rw-rw-r-- 1 jinfang jinfang 298K Nov 1 22:14 hmmer.out + -rw-rw-r-- 1 jinfang jinfang 270K Nov 1 22:15 overview.txt + -rw-rw-r-- 1 jinfang jinfang 1.1M Nov 1 22:15 stp.out + -rw-rw-r-- 1 jinfang jinfang 54K Nov 1 22:18 substrate.out + drwxrwxr-x 2 jinfang jinfang 32K Nov 2 09:48 synteny.pdf + -rw-rw-r-- 1 jinfang jinfang 288K Nov 1 22:14 tf-1.out + -rw-rw-r-- 1 jinfang jinfang 237K Nov 1 22:14 tf-2.out + -rw-rw-r-- 1 jinfang jinfang 804K Nov 1 22:15 tp.out + -rw-rw-r-- 1 jinfang jinfang 31M Nov 1 21:07 uniInput + + + Descriptions of Output Files: + + - ``PUL_blast.out``: BLAST results between CGCs and PULs. + - ``CGC.faa``: CGC Fasta sequences. + - ``cgc.gff``: reformatted from the user input gff file by marking CAZymes, TFs, TCs, and STPs. + - ``cgc.out``: raw output of CGC predictions. + - ``cgc_standard.out``: simplified version of ``cgc.out`` in TSV format for easy parsing. An example row has the following columns: + + 1. CGC_id: CGC1 + 2. type: CAZyme + 3. contig_id: k141_32617 + 4. gene_id: fefifo_8022_1_00137 + 5. start: 1755 + 6. end: 3332 + 7. strand: - + 8. annotation: GH13 + + **Explanation**: The gene fefifo_8022_1_00137 encodes a GH13 CAZyme in the CGC1 of the contig k141_32617. CGC1 also has other genes, which are provided in other rows. fefifo_8022_1_00137 is on the negative strand of k141_32617 from 1755 to 3332. The type can be one of the four signature gene types (CAZymes, TCs, TFs, STPs) or the null type (not annotated as one of the four signature genes). 
+ + - ``cgc_standard.out.json``: JSON format of cgc_standard.out. + - ``dbcan-sub.hmm.out``: HMMER search result against dbCAN-sub HMMdb, including a column with CAZyme substrates extracted from `fam-substrate-mapping-08012023.tsv`. + - ``diamond.out``: DIAMOND search result against the CAZy annotated protein sequences (`CAZyDB.07262023.fa`). + - ``dtemp.out``: temporary file. + - ``hmmer.out``: HMMER search result against dbCAN HMMdb. + - ``overview.txt``: summary of CAZyme annotation from three methods in TSV format. An example row has the following columns: + + 1. ``Gene_ID``: fefifo_8022_1_00719 + 2. ``EC#``: 4.2.2.20:2 + 3. ``dbCAN``: PL8_2(368-612) + 4. ``dbCAN_sub``: PL8_e13 + 5. ``DIAMOND``: PL8_2 + 6. ``#ofTools``: 3 + + **Explanation**: The protein fefifo_8022_1_00719 is annotated by 3 tools to be a CAZyme: (1) PL8_2 (CAZy defined subfamily 2 of PL8) by HMMER vs dbCAN HMMdb with a domain range from aa position 368 to 612, (2) PL8_e13 (eCAMI defined subfamily e13; e indicates it is from eCAMI not CAZy) by HMMER vs dbCAN-sub HMMdb (derived from eCAMI subfamilies), and (3) PL8_2 by DIAMOND vs CAZy annotated protein sequences. The second column 4.2.2.20:2 is extracted from eCAMI, meaning that the eCAMI subfamily PL8_e13 contains two member proteins which have an EC 4.2.2.20 according to CAZy. In most cases, the 3 tools will have the same CAZyme family assignment. When they give different assignments, we recommend a preference order: dbCAN > eCAMI/dbCAN-sub > DIAMOND. See our dbCAN2 paper, dbCAN3 paper, and eCAMI paper for more details. + + **Note**: If users invoked the ``--use_signalP`` parameter when running run_dbcan, there will be an additional column called ``signalP`` in the overview.txt. + + - ``stp.out``: HMMER search result against the MiST5 compiled signal transduction protein HMMs from Pfam. + - ``tf-1.out``: HMMER search result against the DBD6 compiled transcription factor HMMs from Pfam 7. 
+ - ``tf-2.out``: HMMER search result against the DBD compiled transcription factor HMMs from Superfamily 8. + - ``tp.out``: DIAMOND search result against the TCDB 9 annotated protein sequences. + - ``substrate.out``: summary of substrate prediction results for CGCs in TSV format from two approaches (dbCAN-PUL blast search and dbCAN-sub majority voting). An example row has the following columns: + + 1. ``CGC_ID``: k141_31366|CGC2 + 2. ``Best hit PUL_ID in dbCAN-PUL``: PUL0008 + 3. ``Substrate of the hit PUL``: fructan + 4. ``Sum of bitscores for homologous gene pairs between CGC and PUL``: 6132.0 + 5. ``Types of homologous gene pairs``: CAZyme-CAZyme;CAZyme-CAZyme;TC-TC;CAZyme-CAZyme;CAZyme-CAZyme;TC-TC + 6. ``Substrate predicted by majority voting of CAZymes in CGC``: fructan + 7. ``Voting score``: 2.0 + + **Explanation**: The CGC2 of contig ``k141_31366`` has its best hit ``PUL0008`` (from ``PUL_blast.out``) with fructan as substrate (from ``dbCAN-PUL_12-12-2023.xlsx``). Six signature genes are matched between ``k141_31366|CGC2`` and ``PUL0008`` (from ``PUL_blast.out``): four are CAZymes and the other two are TCs. The sum of blast bitscores of the six homologous pairs (``CAZyme-CAZyme, CAZyme-CAZyme, TC-TC, CAZyme-CAZyme, CAZyme-CAZyme and TC-TC``) is 6132.0. Hence, the substrate of ``k141_31366|CGC2`` is predicted to be fructan according to dbCAN-PUL blast search. The last two columns are based on the dbCAN-sub result (``dbcan-sub.hmm.out``), according to which two CAZymes in ``k141_31366|CGC2`` are predicted to have fructan substrate. The voting score is thus 2.0, so that according to the majority voting rule, ``k141_31366|CGC2`` is predicted to have a fructan substrate. + + *Note*: for many CGCs, only one of the two approaches produces substrate prediction. In some cases, the two approaches produce different substrate assignments. We recommend a preference order: ``dbCAN-PUL blast search > dbCAN-sub`` majority voting. 
See our `dbCAN3 _` :cite:`2023:dbCAN3` paper3 for more details. + + - ``synteny.pdf``: a folder with syntenic block alignment plots between all CGCs and PULs. + - ``uniInput``: renamed Fasta file from input protein sequence file. + + +Module 3. Read mapping (Fig. 3) to calculate abundance for CAZyme families, subfamilies, CGCs, and substrates +`````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +P8. Read mapping to all CDS of each sample (TIMING ~10 min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + bwa index prokka_fefifo_8022_1/fefifo_8022_1.ffn + bwa index prokka_fefifo_8022_7/fefifo_8022_7.ffn + mkdir samfiles + bwa mem -t 32 -o samfiles/fefifo_8022_1.CDS.sam prokka_fefifo_8022_1/fefifo_8022_1.ffn fefifo_8022_1__shotgun_1_val_1.fq.gz fefifo_8022_1__shotgun_2_val_2.fq.gz + bwa mem -t 32 -o samfiles/fefifo_8022_7.CDS.sam prokka_fefifo_8022_7/fefifo_8022_7.ffn fefifo_8022_7__shotgun_1_val_1.fq.gz fefifo_8022_7__shotgun_2_val_2.fq.gz + + +Reads are mapped to the ``ffn`` files from Prokka. + + +P9. Read mapping to all contigs of each sample (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + bwa index megahit_fefifo_8022_1/fefifo_8022_1.contigs.fa + bwa index megahit_fefifo_8022_7/fefifo_8022_7.contigs.fa + bwa mem -t 32 -o samfiles/fefifo_8022_1.sam megahit_fefifo_8022_1/fefifo_8022_1.contigs.fa fefifo_8022_1__shotgun_1_val_1.fq.gz fefifo_8022_1__shotgun_2_val_2.fq.gz + bwa mem -t 32 -o samfiles/fefifo_8022_7.sam megahit_fefifo_8022_7/fefifo_8022_7.contigs.fa fefifo_8022_7__shotgun_1_val_1.fq.gz fefifo_8022_7__shotgun_2_val_2.fq.gz + + +Reads are mapped to the `contig` files from MEGAHIT. + +P10. Sort SAM files by coordinates (TIMING ~6min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: shell + + cd samfiles + samtools sort -@ 32 -o fefifo_8022_1.CDS.bam fefifo_8022_1.CDS.sam + samtools sort -@ 32 -o fefifo_8022_7.CDS.bam fefifo_8022_7.CDS.sam + samtools sort -@ 32 -o fefifo_8022_1.bam fefifo_8022_1.sam + samtools sort -@ 32 -o fefifo_8022_7.bam fefifo_8022_7.sam + rm -rf *sam + cd .. + + +P11. Read count calculation for all proteins of each sample using Bedtools (TIMING ~1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + mkdir fefifo_8022_1_abund && cd fefifo_8022_1_abund + seqkit fx2tab -l -n -i ../prokka_fefifo_8022_1/fefifo_8022_1.ffn | awk '{print $1"\t"$2}' > fefifo_8022_1.length + seqkit fx2tab -l -n -i ../prokka_fefifo_8022_1/fefifo_8022_1.ffn | awk '{print $1"\t"0"\t"$2}' > fefifo_8022_1.bed + bedtools coverage -g fefifo_8022_1.length -sorted -a fefifo_8022_1.bed -counts -b ../samfiles/fefifo_8022_1.CDS.bam > fefifo_8022_1.depth.txt + + cd .. && mkdir fefifo_8022_7_abund && cd fefifo_8022_7_abund + seqkit fx2tab -l -n -i ../prokka_fefifo_8022_7/fefifo_8022_7.ffn | awk '{print $1"\t"$2}' > fefifo_8022_7.length + seqkit fx2tab -l -n -i ../prokka_fefifo_8022_7/fefifo_8022_7.ffn | awk '{print $1"\t"0"\t"$2}' > fefifo_8022_7.bed + bedtools coverage -g fefifo_8022_7.length -sorted -a fefifo_8022_7.bed -counts -b ../samfiles/fefifo_8022_7.CDS.bam > fefifo_8022_7.depth.txt + cd .. + + +Read counts are saved in ``depth.txt`` files of each sample. + + +P12. Read count calculation for a given region of contigs using Samtools (TIMING ~1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + cd fefifo_8022_1_abund + samtools index ../samfiles/fefifo_8022_1.bam + samtools depth -r k141_2168:4235-19858 ../samfiles/fefifo_8022_1.bam > fefifo_8022_1.cgc.depth.txt + cd .. + + +The parameter ``-r k141_2168:4235-19858`` specifies a region in a contig. 
For any CGC, its positional range can be found in the file ``cgc_standard.out`` produced by run_dbcan (Box 6). The ``depth.txt`` files contain the raw read counts for the specified region. + + +.. warning:: + + The contig IDs are automatically generated by MEGAHIT. There is a small chance that a same contig ID appears in both samples. However, the two contigs in the two samples do not match each other even if the ID is the same. For example, the contig ID ``k141_2168`` is most likely only found in the ``fefifo_8022_1`` sample. Even if there is a ``k141_2168`` in ``fefifo_8022_7``, the actual contigs in two samples are different. + +P13. dbcan_utils to calculate the abundance of CAZyme families, subfamilies, CGCs, and substrates (TIMING ~1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + dbcan_utils fam_abund -bt fefifo_8022_1.depth.txt -i ../fefifo_8022_1.dbCAN -a TPM + dbcan_utils fam_substrate_abund -bt fefifo_8022_1.depth.txt -i ../fefifo_8022_1.dbCAN -a TPM + dbcan_utils CGC_abund -bt fefifo_8022_1.depth.txt -i ../fefifo_8022_1.dbCAN -a TPM + dbcan_utils CGC_substrate_abund -bt fefifo_8022_1.depth.txt -i ../fefifo_8022_1.dbCAN -a TPM + + cd .. && cd fefifo_8022_7_abund + dbcan_utils fam_abund -bt fefifo_8022_7.depth.txt -i ../fefifo_8022_7.dbCAN -a TPM + dbcan_utils fam_substrate_abund -bt fefifo_8022_7.depth.txt -i ../fefifo_8022_7.dbCAN -a TPM + dbcan_utils CGC_abund -bt fefifo_8022_7.depth.txt -i ../fefifo_8022_7.dbCAN -a TPM + dbcan_utils CGC_substrate_abund -bt fefifo_8022_7.depth.txt -i ../fefifo_8022_7.dbCAN -a TPM + cd .. + + +We developed a set of Python scripts as ``dbcan_utils`` (included in the ``run_dbcan`` package) to take the raw read counts for all genes as input and output the normalized abundances (refer to Box 7) of CAZyme families, subfamilies, CGCs, and substrates (see Fig. 4). 
The parameter ``-a TPM`` can also be set to two other metrics: RPM or RPKM. + +- **RPKM** is calculated as the number of mapped reads to a gene G divided by [(total number of mapped reads to all genes / 10^6) x (gene G length / 1000)]. +- **RPM** is the number of mapped reads to a gene G divided by (total number of mapped reads to all genes / 10^6). +- **TPM** is calculated as [number of mapped reads to a gene G / (gene G length / 1000)] divided by the sum of [number of mapped reads to each gene / (the gene length / 1000)]. + + +Box 7. Example output of dbcan_utils +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As an example, the ``fefifo_8022_1_abund`` folder has 7 TSV files: + +.. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 178K Jan 2 04:08 CGC_abund.out + -rw-rw-r-- 1 jinfang jinfang 3.3K Jan 2 04:08 CGC_substrate_majority_voting.out + -rw-rw-r-- 1 jinfang jinfang 12K Jan 2 04:08 CGC_substrate_PUL_homology.out + -rw-rw-r-- 1 jinfang jinfang 2.5K Jan 2 04:08 EC_abund.out + -rw-rw-r-- 1 jinfang jinfang 4.1K Jan 2 04:08 fam_abund.out + -rw-rw-r-- 1 jinfang jinfang 42K Jan 2 04:08 fam_substrate_abund.out + -rw-rw-r-- 1 jinfang jinfang 26K Jan 2 04:08 subfam_abund.out + +Explanation of columns in these TSV files is as follows: + + - ``fam_abund.out``: CAZy family (from HMMER vs dbCAN HMMdb), sum of TPM, number of CAZymes in the family. + - ``subfam_abund.out``: eCAMI subfamily (from HMMER vs dbCAN-sub HMMdb), sum of TPM, number of CAZymes in the subfamily. + - ``EC_abund.out``: EC number (extracted from dbCAN-sub subfamily), sum of TPM, number of CAZymes with the EC. + - ``fam_substrate_abund.out``: Substrate (from HMMER vs dbCAN-sub HMMdb), sum of TPM (all CAZymes in this substrate group), GeneID (all CAZyme IDs in this substrate group). 
+ - ``CGC_abund.out``: CGC_ID (e.g., k141_338400|CGC1), mean of TPM (all genes in the CGC), Seq_IDs (IDs of all genes in the CGC), TPM (of all genes in the CGC), Families (CAZyme family or other signature gene type of all genes in the CGC). + - ``CGC_substrate_PUL_homology.out``: Substrate (from dbCAN-PUL blast search), sum of TPM, CGC_IDs (all CGCs predicted to have the substrate from dbCAN-PUL blast search), TPM (of CGCs in this substrate group). + - ``CGC_substrate_majority_voting.out``: Substrate (from dbCAN-sub majority voting), sum of TPM, CGC_IDs (all CGCs predicted to have the substrate from dbCAN-sub majority voting), TPM (of CGCs in this substrate group). + + +Module 4: dbcan_plot for data visualization (Fig. 3) of abundances of CAZymes, CGCs, and substrates (TIMING variable) +````````````````````````````````````````````````````````````````````````````````````````````````````````````````````` + +**CRITICAL STEP** + +To visualize the CAZyme annotation result, we provide a set of Python scripts as dbcan_plot to make publication quality plots with the dbcan_utils results as the input. The dbcan_plot scripts are included in the run_dbcan package. Once the plots are made in PDF format, they can be transferred to users' Windows or Mac computers for visualization. + +Five data folders will be needed as the input for ``dbcan_plot``: + +1. two abundance folders ``fefifo_8022_1_abund`` and ``fefifo_8022_7_abund``, +2. two CAZyme annotation ``folders fefifo_8022_1.dbCAN`` and ``fefifo_8022_7.dbCAN``, and +3. the ``dbCAN-PUL folder`` (under the db folder, released from ``dbCAN-PUL.tar.gz``). + +P14. Heatmap for CAZyme substrate abundance across samples (Fig. S4B) (TIMING 1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: shell + + dbcan_plot heatmap_plot --samples fefifo_8022_1,fefifo_8022_7 -i fefifo_8022_1_abund/fam_substrate_abund.out,fefifo_8022_7_abund/fam_substrate_abund.out --show_abund --top 20 + + +Here we plot the top 20 substrates in the two samples. The input files are the two CAZyme substrate abundance files calculated based on dbCAN-sub result. The default heatmap is ranked by substrate abundances. To rank the heatmap according to abundance profile using the ``clustermap`` function of the seaborn package, users can invoke the ``--cluster_map`` parameter. + +P15. Barplot for CAZyme family/subfamily/EC abundance across samples (Fig. S4C) (TIMING 1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + dbcan_plot bar_plot --samples fefifo_8022_1,fefifo_8022_7 --vertical_bar --top 20 -i fefifo_8022_1_abund/fam_abund.out,fefifo_8022_7_abund/fam_abund.out + +Users can choose to generate a barplot instead of heatmap using the ``bar_plot`` method. + +P16. Synteny plot between a CGC and its best PUL hit with read mapping coverage to CGC (Fig. S4A) (TIMING 1min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: shell + + dbcan_plot CGC_synteny_coverage_plot -i fefifo_8022_1.dbCAN --cgcid 'k141_2168|CGC1' --readscount fefifo_8022_1_abund/fefifo_8022_1.cgc.depth.txt + + +The ``fefifo_8022_1.dbCAN`` folder contains the ``PUL_blast.out`` file. Using this file, the ``cgc_standard.out`` file, +and the best PUL's ``gff`` file in ``dbCAN-PUL.tar.gz``, the CGC_synteny_plot method will create the ``CGC-PUL synteny plot``. +The ``--cgcid`` parameter is required to specify which CGC to be plotted (``'k141_2168|CGC1'`` in this example). +The ``fefifo_8022_1.cgc.depth.txt`` file is used to plot the read mapping coverage. + +If users only want to plot the CGC structure: + +.. 
code-block:: shell + + dbcan_plot CGC_plot -i fefifo_8022_1.dbCAN --cgcid 'k141_2168|CGC1' + +If users only want to plot the CGC structure plus the read mapping coverage: + +.. code-block:: shell + + dbcan_plot CGC_coverage_plot -i fefifo_8022_1.dbCAN --cgcid 'k141_2168|CGC1' --readscount fefifo_8022_1_abund/fefifo_8022_1.cgc.depth.txt + +If users only want to plot the synteny between the CGC and PUL: + +.. code-block:: shell + + dbcan_plot CGC_synteny_plot -i fefifo_8022_1.dbCAN --cgcid 'k141_2168|CGC1' + + +.. warning:: + + The CGC IDs in different samples do not match each other. For example, specifying ``-i fefifo_8022_1.dbCAN`` is to plot + the ``'k141_2168|CGC1'`` in the fefifo_8022_1 sample. The ``'k141_2168|CGC1'`` in the fefifo_8022_7 sample most likely does not exist, + and even it does, the CGC has a different sequence even if the ID is the same. + + +.. _priest_2023: + +Example 3: Priest2023 Dataset :cite:`2023:Priest` +------------------------------------------------- diff --git a/docs/user_guide/run_from_raw_reads_pr.rst b/docs/user_guide/run_from_raw_reads_pr.rst new file mode 100644 index 000000000..a3df25ef9 --- /dev/null +++ b/docs/user_guide/run_from_raw_reads_pr.rst @@ -0,0 +1,369 @@ +More examples: Run from Raw Reads +================================= + +.. _priest_2023: + +Example 3: Priest2023 Dataset :cite:`2023:Priest` +------------------------------------------------- + +The Wastyk2021 dataset :cite:`2023:Priest` was published in 2021 from a human dietary intervention study. In the published paper, researchers studied how high-fermented and high-fiber diets influence the human microbiome metabolism and modulate the human immune status. +Among various data analyses conducted in the paper, +CAZymes were mined from shotgun metagenomic reads of 18 healthy human participants, and each participant had four time points of stool samples for metagenome sequencing. 
+CAZyme abundance profiles were compared before and after the high-fiber intervention (baseline vs high-fiber). One of the main findings from their CAZyme analysis was that high-fiber consumption increased the CAZyme abundance. +For this protocol, we will select two samples (paired-end 2x146bp reads) of two time points (day 2 before high-fiber diet as baseline, and 10 weeks after high-fiber diet as intervention) from one participant +The protocol is for the individual sample route. + +============ =========== =========== +Header 1 Header 2 Header 3 +============ =========== =========== +row 1, col 1 row 1, col 2 row 1, col 3 +row 2, col 1 row 2, col 2 row 2, col 3 +============ =========== =========== + + +The Priest2023 dataset :cite:`2021:Wastyk` was published in 20211 from a human dietary intervention study. In the published paper, researchers studied how high-fermented and high-fiber diets influence the human microbiome metabolism and modulate the human immune status. Among various data analyses conducted in the paper1, CAZymes were mined from shotgun metagenomic reads of 18 healthy human participants, and each participant had four time points of stool samples for metagenome sequencing. CAZyme abundance profiles were compared before and after the high-fiber intervention (baseline vs high-fiber). One of the main findings from their CAZyme analysis was that high-fiber consumption increased the CAZyme abundance. For this protocol, we will select two samples (paired-end 2x146bp reads) of two time points (day 2 before high-fiber diet as baseline, and 10 weeks after high-fiber diet as intervention) from one participant (Table S2). The protocol is for the individual sample route (Fig. 3). + +Procedure +--------- + +Module 1: Reads processing (Fig. 3) to obtain contigs +````````````````````````````````````````````````````` + +P1. Contamination Check (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Download the Priest2023 dataset: +.. 
code-block:: shell + + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/BML_MG.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/SRF_MG.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/BML_MT_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/BML_MT_2.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/SRF_MT_1.fastq.gz + wget https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/SRF_MT_2.fastq.gz + +Use `kraken2` to check for contaminated reads: + + +.. code-block:: shell + + kraken2 --threads 32 --quick --paired --db K2 --report SRF_MT.kreport --output SRF_MT.kraken.output SRF_MT_1.fastq.gz SRF_MT_2.fastq.gz + kraken2 --threads 32 --quick --paired --db K2 --report BML_MT.kreport --output BML_MT.kraken.output BML_MT_1.fastq.gz BML_MT_2.fastq.gz + + kraken2 --threads 32 --quick --paired --db K2 --report SRF_MG.kreport --output SRF_MG.kraken.output SRF_MG_1.fastq.gz SRF_MG_2.fastq.gz + kraken2 --threads 32 --quick --paired --db K2 --report BML_MG.kreport --output BML_MG.kraken.output BML_MG_1.fastq.gz BML_MG_2.fastq.gz + +Kraken2 found much contamination (Box 1) from human in the Priest2023 data. Consequently, human reads need to be removed before assembly. + +Reads can be aligned to the reference genomes of potential contamination source organisms to remove the aligned reads. The most common source in microbiome studies is from human. + +Box 1: Example of Kraken2 output files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + The `kreport` files can be examined to identify potential contamination source organisms. + + .. 
code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 54M Dec 27 07:42 BML_MG.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1.2M Dec 27 07:42 BML_MG.kreport + -rw-rw-r-- 1 jinfang jinfang 3.4G Dec 27 08:01 BML_MT.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1023K Dec 27 08:02 BML_MT.kreport + -rw-rw-r-- 1 jinfang jinfang 61M Dec 27 07:39 SRF_MG.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1.2M Dec 27 07:39 SRF_MG.kreport + -rw-rw-r-- 1 jinfang jinfang 2.6G Dec 27 07:50 SRF_MT.kraken.output + -rw-rw-r-- 1 jinfang jinfang 1.1M Dec 27 07:51 SRF_MT.kreport + + +P2. Remove contamination reads from human (TIMING ~40min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +From the Kraken2 output files, we identified humans as the contamination source, we can use the following commands to remove the contamination reads by aligning reads to the human reference genome. + +.. code-block:: shell + + mkdir hg38 && cd hg38 && wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + cd .. && mkdir contamination && cd contamination + minimap2 -a -x map-hifi -MD -t 32 -o SRF_MG.hg38.sam ../hg38/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz ../SRF_MG.fastq.gz + minimap2 -a -x map-hifi -MD -t 32 -o BML_MG.hg38.sam ../hg38/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz ../BML_MG.fastq.gz + samtools fastq -f 4 -@ 32 -0 ../SRF_MG.clean.fq.gz SRF_MG.hg38.sam + samtools fastq -f 4 -@ 32 -0 ../BML_MG.clean.fq.gz BML_MG.hg38.sam + bwa mem ../hg38/hg38 ../SRF_MT_1.fastq.gz ../SRF_MT_2.fastq.gz -t 32 -o SRF_MT.hg38.sam + bwa mem ../hg38/hg38 ../BML_MT_1.fastq.gz ../BML_MT_2.fastq.gz -t 32 -o BML_MT.hg38.sam + samtools fastq -f 12 -@ 32 -1 ../SRF_MT_1.clean.fq.gz -2 ../SRF_MT_2.clean.fq.gz SRF_MT.hg38.sam + samtools fastq -f 12 -@ 32 -1 ../BML_MT_1.clean.fq.gz -2 ../BML_MT_2.clean.fq.gz BML_MT.hg38.sam + cd .. 
+ +P3| Trim adapter and low-quality reads (TIMING ~20min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. code-block:: shell + + trim_galore --illumina -j 8 --paired BML_MT_1.clean.fastq.gz BML_MT_2.clean.fastq.gz + trim_galore --illumina -j 8 --paired SRF_MT_1.clean.fastq.gz SRF_MT_2.clean.fastq.gz + +The HiFi long reads do not need to be trimmed. Hence, this step only applies to MT illumina short read data. We specified --illumina to indicate that the reads were generated using the Illumina sequencing platform. Nonetheless, trim_galore possesses the ability to automatically detect the adapter, providing flexibility in adapter handling for users who may know the specific sequencing platform. Details of trimming are available in the trimming report file (Box 2). + +Box 2: Example output of trim_galore +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In addition to the trimmed read files, Trim_galore also generates a trimming report file. The trimming report contains details on read trimming, such as the number of trimmed reads. + + .. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 4.2K Dec 28 21:56 BML_MT_1.clean.fq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 2.3G Dec 28 22:05 BML_MT_1.clean_val_1.fq.gz + -rw-rw-r-- 1 jinfang jinfang 4.7K Dec 28 22:05 BML_MT_2.clean.fq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 3.0G Dec 28 22:05 BML_MT_2.clean_val_2.fq.gz + -rw-rw-r-- 1 jinfang jinfang 4.9K Dec 28 10:07 SRF_MT_1.clean.fq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 2.7G Dec 28 10:19 SRF_MT_1.clean_val_1.fq.gz + -rw-rw-r-- 1 jinfang jinfang 5.1K Dec 28 10:19 SRF_MT_2.clean.fq.gz_trimming_report.txt + -rw-rw-r-- 1 jinfang jinfang 3.3G Dec 28 10:19 SRF_MT_2.clean_val_2.fq.gz + +.. warning:: + + During the trimming process, certain reads may be entirely removed due to low quality in its entirety. 
Using the `--retain_unpaired` parameter in trim_galore allows for the preservation of single-end reads. In this protocol, this option was not selected, so that both reads of a forward-revise pair were removed. + + + +P4. Assemble HiFi reads into metagenome (TIMING ~4h20min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Flye was used to assemble the HiFi long reads into contigs. + +.. code-block:: shell + + flye --threads 32 --meta --pacbio-hifi BML_MG.clean.fq.gz --hifi-error 0.01 --keep-haplotypes --out-dir flye_BML_MG + flye --threads 32 --meta --pacbio-hifi SRF_MG.clean.fq.gz --hifi-error 0.01 --keep-haplotypes --out-dir flye_SRF_MG + +Flye generates two folders `flye_BML_MG` and `flye_SRF_MG`. Each folder +contains 6 files and 5 sub-folders (Box 3), among them `assembly.fasta` is the final contig sequence file. +We set `--hifi-error` 0.01, a generally accepted error rate of HiFi sequencing. +Parameter `--meta` is set to assemble reads into metagenomes. + +Box 3: Example output of Flye +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. code-block:: shell + + drwxrwxr-x 2 jinfang jinfang 4.0K Dec 27 20:15 00-assembly + drwxrwxr-x 2 jinfang jinfang 4.0K Dec 27 20:43 10-consensus + drwxrwxr-x 2 jinfang jinfang 4.0K Dec 27 21:14 20-repeat + drwxrwxr-x 2 jinfang jinfang 4.0K Dec 27 21:16 30-contigger + drwxrwxr-x 2 jinfang jinfang 4.0K Dec 27 22:06 40-polishing + -rw-rw-r-- 1 jinfang jinfang 314M Dec 27 22:06 assembly.fasta + -rw-rw-r-- 1 jinfang jinfang 311M Dec 27 22:06 assembly_graph.gfa + -rw-rw-r-- 1 jinfang jinfang 6.6M Dec 27 22:06 assembly_graph.gv + -rw-rw-r-- 1 jinfang jinfang 867K Dec 27 22:06 assembly_info.txt + -rw-rw-r-- 1 jinfang jinfang 61M Dec 27 22:06 flye.log + -rw-rw-r-- 1 jinfang jinfang 92 Dec 27 22:06 params.json + + +P5. Predict genes by Prokka (~21h) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: shell + + prokka --outdir prokka_BML_MG --prefix BML_MG --addgenes --addmrna --locustag BML_MG --kingdom Bacteria --cpus 36 flye_BML_MG/assembly.fasta + prokka --outdir prokka_SRF_MG --prefix SRF_MG --addgenes --addmrna --locustag SRF_MG --kingdom Bacteria --cpus 36 flye_SRF_MG/assembly.fasta + +The parameter `--kingdom` Bacteria is required for bacterial gene prediction. +To optimize performance, `--CPU` 36 instructs the utilization of 36 computer processors. +The output files comprise of both protein and CDS sequences in Fasta format (e.g., `BML_MG.faa` and `SRF_MG.ffn` in Box 4). + +Box 3: Example output of Prokka +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + .. code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 2.2M Dec 28 05:38 BML_MG.err + -rw-rw-r-- 1 jinfang jinfang 105M Dec 27 23:26 BML_MG.faa + -rw-rw-r-- 1 jinfang jinfang 288M Dec 27 23:26 BML_MG.ffn + -rw-rw-r-- 1 jinfang jinfang 314M Dec 27 22:06 BML_MG.fna + -rw-rw-r-- 1 jinfang jinfang 315M Dec 27 23:26 BML_MG.fsa + -rw-rw-r-- 1 jinfang jinfang 724M Dec 28 05:39 BML_MG.gbk + -rw-rw-r-- 1 jinfang jinfang 467M Dec 27 23:26 BML_MG.gff + -rw-rw-r-- 1 jinfang jinfang 1.9M Dec 28 05:39 BML_MG.log + -rw-rw-r-- 1 jinfang jinfang 1.5G Dec 28 05:39 BML_MG.sqn + -rw-rw-r-- 1 jinfang jinfang 89M Dec 27 23:26 BML_MG.tbl + -rw-rw-r-- 1 jinfang jinfang 40M Dec 27 23:26 BML_MG.tsv + -rw-rw-r-- 1 jinfang jinfang 152 Dec 27 23:26 BML_MG.txt + + +Module 1: run_dbcan annotation (Fig. 3) to obtain CAZymes, CGCs, and substrates +``````````````````````````````````````````````````````````````````````````````````````````````` + +Users can skip P6 and P7, and directly run P8 (much slower though), if they want to predict not only CAZymes and CGCs, but also substrates. + +P6. CAZyme annotation at family level (TIMING ~10min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: shell + + run_dbcan prokka_BML_MG/BML_MG.faa protein --hmm_cpu 32 --out_dir BML_MG.CAZyme --tools hmmer --db_dir db + run_dbcan prokka_SRF_MG/SRF_MG.faa protein --hmm_cpu 32 --out_dir SRF_MG.CAZyme --tools hmmer --db_dir db + +Two arguments are required for run_dbcan: the input sequence file (faa) and the sequence type (protein). By default, run_dbcan will use three methods (HMMER vs dbCAN HMMdb, DIAMOND vs CAZy, HMMER vs dbCAN-sub HMMdb) for CAZyme annotation (Table 1, Fig. 2). This default setting is equivalent to using the `--tools all` parameter (Box 5). Here we only invoke the HMMER vs dbCAN HMMdb for CAZyme annotation at the family level. + + +Box 5: CAZyme annotation with default setting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + If the `--tools` parameter is not set, it is the default setting, which is the same as `--tools` all. + This will take a much longer time to finish (~5h) due to the large size of dbCAN-sub HMMdb (used for substrate prediction for CAZymes, see Table 1). + + .. code-block:: shell + + run_dbcan prokka_BML_MG/BML_MG.faa protein --out_dir BML_MG.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all + run_dbcan prokka_SRF_MG/SRF_MG.faa protein --out_dir SRF_MG.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 --tools all + + The sequence type can be protein, prok, meta. If the input sequence file contains metagenomic contig sequences (fna file), + the sequence type has to be meta, and prodigal will be called to predict genes. + + .. code-block:: shell + + run_dbcan prokka_BML_MG/BML_MG.fna meta --out_dir BML_MG.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 + run_dbcan prokka_SRF_MG/SRF_MG.fna meta --out_dir SRF_MG.CAZyme --dia_cpu 32 --hmm_cpu 32 --dbcan_thread 32 + +P7. 
CGC prediction (TIMING ~15 min) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following commands will re-run run_dbcan to not only predict CAZymes but also CGCs with protein faa and gene location gff files. + + .. code-block:: shell + + run_dbcan prokka_BML_MG/BML_MG.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_BML_MG/BML_MG.gff --out_dir BML_MG.PUL --dia_cpu 32 --hmm_cpu 32 + run_dbcan prokka_SRF_MG/SRF_MG.faa protein --tools hmmer --tf_cpu 32 --stp_cpu 32 -c prokka_SRF_MG/SRF_MG.gff --out_dir SRF_MG.PUL --dia_cpu 32 --hmm_cpu 32 + +As mentioned above (Table 1, Fig. 2), CGC prediction is a featured function added into dbCAN2 in 2018. +To identify CGCs with the protein sequence type, a gene location file (gff) must be provided together. +If the input sequence type is prok or meta, meaning users only have contig fna files, +the CGC prediction can be activated by setting `-c cluster`. + +.. warning:: + + **CAUTION ** + + If the users would like to create their own gff file (instead of using Prokka or Prodigal), + it is important to make sure the value of ID attribute in the gff file matches the protein ID in the protein faa file. + + **Troubleshooting** + + If no result is found in CGC output file, it is most likely because the sequence IDs in gff file and faa file do not match. + Another less likely reason is that the contigs are too short and fragmented and not suitable for CGC prediction. + +P8. Substrate prediction for CAZymes and CGCs (TIMING ~5h) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following commands will re-run run_dbcan to predict CAZymes, CGCs, +and their substrates with the `--cgc_substrate` parameter. + +.. 
code-block:: shell + run_dbcan prokka_BML_MG/BML_MG.faa protein --dbcan_thread 32 --tf_cpu 32 --stp_cpu 32 -c prokka_BML_MG/BML_MG.gff --cgc_substrate --hmm_cpu 32 --out_dir BML_MG.dbCAN --dia_cpu 32 + run_dbcan prokka_SRF_MG/SRF_MG.faa protein --dbcan_thread 32 --stp_cpu 32 -c prokka_SRF_MG/SRF_MG.gff --cgc_substrate --out_dir SRF_MG.dbCAN --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + +.. warning:: + + The above commands do not set the `--tools` parameter, + which means all three methods for CAZyme annotation will be activated (Box 5). + Because dbCAN-sub HMMdb (for CAZyme substrate prediction) is 200 times larger than dbCAN HMMdb, + the runtime will be much longer. Users can specify `--tools` hmmer, + so that the HMMER search against dbCAN-sub will be disabled. + However, this will turn off the substrate prediction for CAZymes and CGCs based on CAZyme substrate majority voting. + Consequently, + the substrate prediction will be solely based on homology search against PULs in dbCAN-PUL (Fig. 1, Table 1). + +.. code-block:: shell + + run_dbcan prokka_BML_MG/BML_MG.faa protein --tools hmmer --stp_cpu 32 -c prokka_BML_MG/BML_MG.gff --cgc_substrate --out_dir BML_MG.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + run_dbcan prokka_SRF_MG/SRF_MG.faa protein --tools hmmer --stp_cpu 32 -c prokka_SRF_MG/SRF_MG.gff --cgc_substrate --out_dir SRF_MG.PUL.Sub --dia_cpu 32 --hmm_cpu 32 --tf_cpu 32 + +Box 6: Example output folder content of run_dbcan substrate prediction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + In the output directory (https://bcb.unl.edu/dbCAN_tutorial/dataset3-Priest2023/BML_MG.dbCAN/), a total of 17 files and 1 folder are generated: + + .. 
code-block:: shell + + -rw-rw-r-- 1 jinfang jinfang 9.6M Dec 28 10:18 PUL_blast.out + -rw-rw-r-- 1 jinfang jinfang 1.8M Dec 28 10:18 CGC.faa + -rw-rw-r-- 1 jinfang jinfang 26M Dec 28 10:18 cgc.gff + -rw-rw-r-- 1 jinfang jinfang 450K Dec 28 10:18 cgc.out + -rw-rw-r-- 1 jinfang jinfang 212K Dec 28 10:18 cgc_standard.out + -rw-rw-r-- 1 jinfang jinfang 1005K Dec 28 10:18 cgc_standard.out.json + -rw-rw-r-- 1 jinfang jinfang 406K Dec 28 10:11 dbcan-sub.hmm.out + -rw-rw-r-- 1 jinfang jinfang 325K Dec 28 10:11 diamond.out + -rw-rw-r-- 1 jinfang jinfang 332K Dec 28 10:11 dtemp.out + -rw-rw-r-- 1 jinfang jinfang 220K Dec 28 10:11 hmmer.out + -rw-rw-r-- 1 jinfang jinfang 240K Dec 28 10:18 overview.txt + -rw-rw-r-- 1 jinfang jinfang 1.7M Dec 28 10:17 stp.out + -rw-rw-r-- 1 jinfang jinfang 17K Dec 28 10:18 substrate.out + drwxrwxr-x 2 jinfang jinfang 12K Dec 28 10:19 synteny.pdf + -rw-rw-r-- 1 jinfang jinfang 293K Dec 28 10:13 tf-1.out + -rw-rw-r-- 1 jinfang jinfang 222K Dec 28 10:15 tf-2.out + -rw-rw-r-- 1 jinfang jinfang 1.7M Dec 28 10:17 tp.out + -rw-rw-r-- 1 jinfang jinfang 105M Dec 28 05:57 uniInput + + +Descriptions of Output Files: + + - ``PUL_blast.out``: BLAST results between CGCs and PULs. + - ``CGC.faa``: CGC Fasta sequences. + - ``cgc.gff``: reformatted from the user input gff file by marking CAZymes, TFs, TCs, and STPs. + - ``cgc.out``: raw output of CGC predictions. +Each entry in cgc.out includes: + + + 1. CGC_id: CGC1 + 2. type: CAZyme + 3. contig_id: contig_10157 + 4. gene_id: BML_MG_01992 + 5. start: 33003 + 6. end: 36077 + 7. strand: + + 8. annotation: GH2 + +Explanation: the gene BML_MG_01992 encodes a GH2 CAZyme in the CGC1 of the contig contig_10157. CGC1 also has other genes, which are provided in other rows. BML_MG_01992 is on the positive strand of contig_10157 from 33003 to 36077. The type can be one of the four signature gene types (CAZymes, TCs, TFs, STPs) or the null type (not annotated as one of the four signature genes). 
+ +`cgc_standard.out.json`: JSON format of cgc_standard.out. +`dbcan-sub.hmm.out`: HMMER search result against dbCAN-sub HMMdb, including a column with CAZyme substrates extracted from fam-substrate-mapping-08012023.tsv. +`diamond.out`: DIAMOND search result against the CAZy annotated protein sequences (CAZyDB.07262023.fa). +`dtemp.out`: temporary file. +`hmmer.out`: HMMER search result against dbCAN HMMdb. +`overview.txt`: summary of CAZyme annotation from three methods in TSV format. An example row has the following columns: + 1. Gene_ID: BML_MG_01761 + 2. EC#: 2.4.99.-:5 + 3. dbCAN: GT112(19-370) + 4. dbCAN_sub: GT112_e0 + 5. DIAMOND: GT112 + 6. #ofTools: 3 +Explanation: the protein BML_MG_01761 is annotated by 3 tools to be a CAZyme: (1) GT112 (CAZy defined family GT112) by HMMER vs dbCAN HMMdb with a domain range from aa position 19 to 370, (2) GT112_e0 (eCAMI defined subfamily e0; e indicates it is from eCAMI not CAZy) by HMMER vs dbCAN-sub HMMdb (derived from eCAMI subfamilies), and (3) GT112 by DIAMOND vs CAZy annotated protein sequences. The second column 2.4.99.-:5 is extracted from eCAMI, meaning that the eCAMI subfamily GT112_e0 contains 5 member proteins which have an EC 2.4.99.- according to CAZy. In most cases, the 3 tools will have the same CAZyme family assignment. When they give different assignments, we recommend a preference order: dbCAN > eCAMI/dbCAN-sub > DIAMOND. See our dbCAN2 paper2, dbCAN3 paper3, and eCAMI4 for more details. +Note: If users invoked the --use_signalP parameter when running run_dbcan, there will be an additional column called signal in the overview.txt. +stp.out: HMMER search result against the MiST5 compiled signal transduction protein HMMs from Pfam. +tf-1.out: HMMER search result against the DBD6 compiled transcription factor HMMs from Pfam 7. +tf-2.out: HMMER search result against the DBD compiled transcription factor HMMs from Superfamily 8. +tp.out: DIAMOND search result against the TCDB 9 annotated protein sequences. 
+substrate.out: summary of substrate prediction results for CGCs in TSV format from two approaches3 (dbCAN-PUL blast search and dbCAN-sub majority voting). An example row has the following columns: + 1. CGC_ID: contig_10778|CGC2 + 2. Best hit PUL_ID in dbCAN-PUL: PUL0400 + 3. Substrate of the hit PUL: alginate + 4. Sum of bitscores for homologous gene pairs between CGC and PUL: 851.0 + 5. Types of homologous gene pairs: CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme + 6. Substrate predicted by majority voting of CAZymes in CGC: alginate + 7. Voting score: 2.0 +Explanation: The CGC2 of contig_10778 has its best hit PUL0400 (from PUL_blast.out) with alginate as substrate (from dbCAN-PUL_12-12-2023.xlsx). Four signature genes are matched between contig_10778|CGC2 and PUL0400 (from PUL_blast.out): all the four are CAZymes. The sum of blast bitscores of the 4 homologous pairs (CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme;CAZyme-CAZyme) is 851.0. Hence, the substrate of contig_10778|CGC2 is predicted to be alginate according to dbCAN-PUL blast search. The last two columns are based on the dbCAN-sub result (dbcan-sub.hmm.out), according to which two CAZymes in contig_10778|CGC2 are predicted to have alginate substrate. The voting score is thus 2.0, so that according to the majority voting rule, contig_10778|CGC2 is predicted to have an alginate substrate. +Note: for many CGCs, only one of the two approaches produces substrate prediction. In some cases, the two approaches produce different substrate assignments. We recommend a preference order: dbCAN-PUL blast search > dbCAN-sub majority voting. See our dbCAN3 paper3 for more details. +synteny.pdf: a folder with syntenic block alignment plots between all CGCs and PULs. +uniInput: renamed Fasta file from input protein sequence file. 
+ + + + + + + + + + + + + diff --git a/pyproject.toml b/pyproject.toml index 65993e880..202e18461 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ["hatchling"] [project] name = "dbcan" -version = "4.1.0" +version = "4.1.2" description = "Standalone version of dbCAN annotation tool for automated CAZyme annotation" readme = "README.md" requires-python = ">=3.6" @@ -25,6 +25,10 @@ dependencies = [ "scipy", "pandas", "biopython", + "tqdm", + "openpyxl", + "matplotlib", + "pyhmmer", # for debug logging (referenced from the issue template) "session-info" ] @@ -50,6 +54,7 @@ syntenic_plot = "dbcan.cli.syntenic_plot:main" dbcan_utils = "dbcan.utils.utils:main" dbcan_plot = "dbcan.utils.plots:main" dbcan_asmfree = "dbcan.utils.diamond_unassembly:main" +dbcan_build = "dbcan.utils.dbcan_build:main" [project.optional-dependencies] dev = [ @@ -131,7 +136,7 @@ ignore = [ # First line should be in imperative mood; try rephrasing "D401", ## Disable one in each pair of mutually incompatible rules - # We don’t want a blank line before a class docstring + # We don't want a blank line before a class docstring "D203", # We want docstrings to start immediately after the opening triple quote "D213",