From 5545f7598188ddfb2454ecbb381b4f405b231dcf Mon Sep 17 00:00:00 2001
From: Hajari Taheri <mkh6167@e5-cse-205-15.cse.psu.edu>
Date: Mon, 13 Nov 2023 19:48:13 -0500
Subject: [PATCH 01/76] conda release

---
 cli.py                              | 26 ++++++++++++++++++
 make_training_data_from_sketches.py | 34 +++++++++++++++--------
 meta.yaml                           | 38 ++++++++++++++++++++++++++
 run_YACHT.py                        | 42 ++++++++++++++++++-----------
 setup.py                            | 26 ++++++++++++++++++
 5 files changed, 139 insertions(+), 27 deletions(-)
 create mode 100644 cli.py
 create mode 100644 meta.yaml
 create mode 100644 setup.py

diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..f07f36c
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,26 @@
+import argparse
+import run_YACHT as run_YACHT
+import make_training_data_from_sketches as make_training_data
+
+def main():
+    parser = argparse.ArgumentParser(prog='yacht')
+    subparsers = parser.add_subparsers(dest='command')
+
+    # Run command
+    run_parser = subparsers.add_parser('run')
+    run_YACHT.add_arguments(run_parser)
+    run_parser.set_defaults(func=run_YACHT.main)
+
+    # Train command
+    train_parser = subparsers.add_parser('train')
+    make_training_data.add_arguments(train_parser)
+    train_parser.set_defaults(func=make_training_data.main)
+
+    args = parser.parse_args()
+    if 'func' in args:
+        args.func(args)
+    else:
+        parser.print_help()
+
+if __name__ == '__main__':
+    main()
diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py
index 038d091..6da6a8c 100644
--- a/make_training_data_from_sketches.py
+++ b/make_training_data_from_sketches.py
@@ -9,26 +9,26 @@
 from loguru import logger
 import json
 import shutil
+
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="This script converts a collection of signature files into a reference database matrix.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+def add_arguments(parser):
     parser.add_argument('--ref_file', help='Location of the Sourmash signature database file. '
                                            'This is expected to be in Zipfile format (eg. *.zip)'
                                            'that contains a manifest "SOURMASH-MANIFEST.csv" and a folder "signatures"'
                                            'with all Gzip-format signature file (eg. *.sig.gz) ', required=True)
     parser.add_argument('--ksize', type=int, help='Size of kmers in sketch since Zipfiles', required=True)
-    parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False, default=16)
+    parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False,
+                        default=16)
     parser.add_argument('--ani_thresh', type=float, help='mutation cutoff for species equivalence.',
                         required=False, default=0.95)
     parser.add_argument('--prefix', help='Prefix for this experiment.', required=False, default='yacht')
     parser.add_argument('--outdir', type=str, help='path to output directory', required=False, default=os.getcwd())
     parser.add_argument('--force', action='store_true', help='Overwrite the output directory if it exists')
-    args = parser.parse_args()
 
+
+def main(args):
     # get the arguments
     ref_file = str(Path(args.ref_file).absolute())
     ksize = args.ksize
@@ -41,21 +41,24 @@
     # make sure reference database file exist and valid
     logger.info("Checking reference database file")
     if os.path.splitext(ref_file)[1] != '.zip':
-        raise ValueError(f"Reference database file {ref_file} is not a zip file. Please a Sourmash signature database file with Zipfile format.")
-    utils.check_file_existence(str(Path(ref_file).absolute()), f'Reference database zip file {ref_file} does not exist.')
+        raise ValueError(
+            f"Reference database file {ref_file} is not a zip file. Please a Sourmash signature database file with Zipfile format.")
+    utils.check_file_existence(str(Path(ref_file).absolute()),
+                               f'Reference database zip file {ref_file} does not exist.')
 
     # Create a temporary directory with time info as label
     logger.info("Creating a temporary directory")
-    path_to_temp_dir = os.path.join(outdir, prefix+'_intermediate_files')
+    path_to_temp_dir = os.path.join(outdir, prefix + '_intermediate_files')
     if os.path.exists(path_to_temp_dir) and not force:
-        raise ValueError(f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.")
+        raise ValueError(
+            f"Temporary directory {path_to_temp_dir} already exists. Please remove it or given a new prefix name using parameter '--prefix'.")
     else:
         # remove the temporary directory if it exists
         if os.path.exists(path_to_temp_dir):
             logger.warning(f"Temporary directory {path_to_temp_dir} already exists. Removing it.")
             shutil.rmtree(path_to_temp_dir)
     os.makedirs(path_to_temp_dir, exist_ok=True)
-    
+
     # unzip the sourmash signature file to the temporary directory
     logger.info("Unzipping the sourmash signature file to the temporary directory")
     with zipfile.ZipFile(ref_file, 'r') as sourmash_db:
@@ -104,3 +107,12 @@
                'scale': scale,
                'ksize': ksize,
                'ani_thresh': ani_thresh}, open(json_file_path, 'w'), indent=4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="This script converts a collection of signature files into a reference database matrix.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add_arguments(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/meta.yaml b/meta.yaml
new file mode 100644
index 0000000..1b27964
--- /dev/null
+++ b/meta.yaml
@@ -0,0 +1,38 @@
+package:
+  name: yacht
+  version: 1.0
+
+source:
+  path: .
+
+build:
+  noarch: python
+  script: python -m pip install . --no-deps --ignore-installed --no-cache-dir
+
+requirements:
+  host:
+    - python >3.6
+    - pip
+  run:
+    - python >3.6
+    - sourmash >=4.8.3,<5
+    - rust
+    - scipy
+    - numpy
+    - pandas
+    - scikit-learn
+    - loguru
+    - maturin >=1,<2
+    - tqdm
+    - biom-format
+    - pytaxonkit
+    - openpyxl
+
+test:
+  imports:
+    - yacht
+
+about:
+  home: https://github.com/KoslickiLab/YACHT
+  license: MIT License
+  summary: YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).
diff --git a/run_YACHT.py b/run_YACHT.py
index fe7d99a..f0b2045 100644
--- a/run_YACHT.py
+++ b/run_YACHT.py
@@ -17,26 +17,26 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="This script estimates the abundance of microorganisms from a "
-                    "reference database matrix and metagenomic sample.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--json', type=str, help='Path to a json file generated by make_training_data_from_sketches.py.', required=True)
+def add_arguments(parser):
+    parser.add_argument('--json', type=str,
+                        help='Path to a json file generated by make_training_data_from_sketches.py.', required=True)
     parser.add_argument('--sample_file', help='Metagenomic sample in .sig.zip format', required=True)
     parser.add_argument('--significance', type=float, help='Minimum probability of individual true negative.',
                         required=False, default=0.99)
-    parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False, default=16)
+    parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False,
+                        default=16)
     parser.add_argument('--keep_raw', action='store_true', help='Keep raw results in output file.')
-    parser.add_argument('--show_all', action='store_true', help='Show all organisms (no matter if present) in output file.')
-    parser.add_argument('--min_coverage_list', nargs="+", type=float, help='A list of percentages of unique k-mers covered by reads in the sample. '
-                                                           'Each value should be between 0 and 1, with 0 being the most sensitive (and least '
-                                                           'precise) and 1 being the most precise (and least sensitive).', 
-                                                           required=False, default=[1, 0.5, 0.1, 0.05, 0.01])
-    parser.add_argument('--out', type=str, help='path to output excel file', required=False, default=os.path.join(os.getcwd(), 'result.xlsx'))
-
-    # parse the arguments
-    args = parser.parse_args()
+    parser.add_argument('--show_all', action='store_true',
+                        help='Show all organisms (no matter if present) in output file.')
+    parser.add_argument('--min_coverage_list', nargs="+", type=float,
+                        help='A list of percentages of unique k-mers covered by reads in the sample. '
+                             'Each value should be between 0 and 1, with 0 being the most sensitive (and least '
+                             'precise) and 1 being the most precise (and least sensitive).',
+                        required=False, default=[1, 0.5, 0.1, 0.05, 0.01])
+    parser.add_argument('--out', type=str, help='path to output excel file', required=False,
+                        default=os.path.join(os.getcwd(), 'result.xlsx'))
+
+def main(args):
     json_file_path = str(Path(args.json).absolute()) # path to json file
     sample_file = str(Path(args.sample_file).absolute()) # location of sample.sig file
     significance = args.significance  # Minimum probability of individual true negative.
@@ -148,3 +148,13 @@
             if not show_all:
                 temp_mainifest = temp_mainifest[temp_mainifest['in_sample_est'] == True]
             temp_mainifest.to_excel(writer, sheet_name=f'min_coverage{min_coverage}', index=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="This script estimates the abundance of microorganisms from a "
+                    "reference database matrix and metagenomic sample.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add_arguments(parser)
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..4b0db16
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='yacht',
+    version='1.0',
+    author='Koslicki, D., White, S., Ma, C., & Novikov, A.',
+    description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
+    packages=find_packages(),
+    install_requires=[
+        'sourmash>=4.8.3,<5',
+        'scipy',
+        'numpy',
+        'pandas',
+        'scikit-learn',
+        'loguru',
+        'tqdm',
+        'biom-format',
+        'pytaxonkit',
+        'openpyxl'
+    ],
+    entry_points={
+        'console_scripts': [
+            'yacht = cli:main',
+        ],
+    },
+)

From 8174420dd49f5ddd4303bbb39284c26801e9c243 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Mon, 13 Nov 2023 20:57:59 -0500
Subject: [PATCH 02/76] remove test for now

---
 meta.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/meta.yaml b/meta.yaml
index 1b27964..73b9366 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -28,10 +28,6 @@ requirements:
     - pytaxonkit
     - openpyxl
 
-test:
-  imports:
-    - yacht
-
 about:
   home: https://github.com/KoslickiLab/YACHT
   license: MIT License

From f7946beebb8d8def456dc7dad774d12db2db7dd0 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Mon, 13 Nov 2023 21:13:02 -0500
Subject: [PATCH 03/76] cli not found error

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4b0db16..3299003 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
     ],
     entry_points={
         'console_scripts': [
-            'yacht = cli:main',
+            'yacht = yacht.cli:main',
         ],
     },
 )

From 423e923cb1f6f5e6b44653a1580a528af5ea53d9 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Mon, 13 Nov 2023 22:15:09 -0500
Subject: [PATCH 04/76] cli not found error

---
 setup.py                                                        | 2 +-
 cli.py => srcs/cli.py                                           | 0
 .../make_training_data_from_sketches.py                         | 0
 run_YACHT.py => srcs/run_YACHT.py                               | 0
 4 files changed, 1 insertion(+), 1 deletion(-)
 rename cli.py => srcs/cli.py (100%)
 rename make_training_data_from_sketches.py => srcs/make_training_data_from_sketches.py (100%)
 rename run_YACHT.py => srcs/run_YACHT.py (100%)

diff --git a/setup.py b/setup.py
index 3299003..44dd1df 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
     ],
     entry_points={
         'console_scripts': [
-            'yacht = yacht.cli:main',
+            'yacht = srcs.cli:main',
         ],
     },
 )
diff --git a/cli.py b/srcs/cli.py
similarity index 100%
rename from cli.py
rename to srcs/cli.py
diff --git a/make_training_data_from_sketches.py b/srcs/make_training_data_from_sketches.py
similarity index 100%
rename from make_training_data_from_sketches.py
rename to srcs/make_training_data_from_sketches.py
diff --git a/run_YACHT.py b/srcs/run_YACHT.py
similarity index 100%
rename from run_YACHT.py
rename to srcs/run_YACHT.py

From 5c00a9c58d2b42adb23b234d58fb5358302418f5 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 10:48:07 -0500
Subject: [PATCH 05/76] module error

---
 srcs/cli.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/srcs/cli.py b/srcs/cli.py
index f07f36c..2563813 100644
--- a/srcs/cli.py
+++ b/srcs/cli.py
@@ -1,8 +1,9 @@
 import argparse
-import run_YACHT as run_YACHT
-import make_training_data_from_sketches as make_training_data
 
-def main():
+from srcs import run_YACHT
+
+
+def main(make_training_data=None):
     parser = argparse.ArgumentParser(prog='yacht')
     subparsers = parser.add_subparsers(dest='command')
 

From 1c4062f5cced98d15f96cddf4313f66fce77fa2b Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 11:18:20 -0500
Subject: [PATCH 06/76] make_training_data_from_sketches import error

---
 srcs/cli.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/srcs/cli.py b/srcs/cli.py
index 2563813..6948d05 100644
--- a/srcs/cli.py
+++ b/srcs/cli.py
@@ -1,9 +1,10 @@
 import argparse
 
 from srcs import run_YACHT
+from srcs import make_training_data_from_sketches
 
 
-def main(make_training_data=None):
+def main():
     parser = argparse.ArgumentParser(prog='yacht')
     subparsers = parser.add_subparsers(dest='command')
 
@@ -14,7 +15,7 @@ def main(make_training_data=None):
 
     # Train command
     train_parser = subparsers.add_parser('train')
-    make_training_data.add_arguments(train_parser)
+    make_training_data_from_sketches.add_arguments(train_parser)
     train_parser.set_defaults(func=make_training_data.main)
 
     args = parser.parse_args()

From b0bb96a872db89e95179b1d5de56635e90c4e834 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 11:53:10 -0500
Subject: [PATCH 07/76] make_training_data_from_sketches import error

---
 srcs/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srcs/cli.py b/srcs/cli.py
index 6948d05..e7d00f3 100644
--- a/srcs/cli.py
+++ b/srcs/cli.py
@@ -16,7 +16,7 @@ def main():
     # Train command
     train_parser = subparsers.add_parser('train')
     make_training_data_from_sketches.add_arguments(train_parser)
-    train_parser.set_defaults(func=make_training_data.main)
+    train_parser.set_defaults(func=make_training_data_from_sketches.main)
 
     args = parser.parse_args()
     if 'func' in args:

From 23208cda0331dc7a99065309ee3473ec3671c7d9 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 13:37:55 -0500
Subject: [PATCH 08/76] add convert functionality to commands

---
 srcs/cli.py                      |   6 ++
 srcs/standardize_yacht_output.py | 126 +++++++++++++++++--------------
 2 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/srcs/cli.py b/srcs/cli.py
index e7d00f3..c3fadd4 100644
--- a/srcs/cli.py
+++ b/srcs/cli.py
@@ -2,6 +2,7 @@
 
 from srcs import run_YACHT
 from srcs import make_training_data_from_sketches
+from srcs import standardize_yacht_output
 
 
 def main():
@@ -18,6 +19,11 @@ def main():
     make_training_data_from_sketches.add_arguments(train_parser)
     train_parser.set_defaults(func=make_training_data_from_sketches.main)
 
+    # Convert command
+    convert_parser = subparsers.add_parser('convert')
+    standardize_yacht_output.add_arguments(convert_parser)
+    convert_parser.set_defaults(func=standardize_yacht_output.main)
+
     args = parser.parse_args()
     if 'func' in args:
         args.func(args)
diff --git a/srcs/standardize_yacht_output.py b/srcs/standardize_yacht_output.py
index 21124ce..28f0e6c 100644
--- a/srcs/standardize_yacht_output.py
+++ b/srcs/standardize_yacht_output.py
@@ -16,6 +16,73 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
+def add_arguments(parser):
+    parser.add_argument('--yacht_output', type=str, help='Path to the YACHT output excel file.', required=True)
+    parser.add_argument('--sheet_name', type=str, help='The sheet name of the YACHT output excel file.', required=True)
+    parser.add_argument('--genome_to_taxid', type=str, help='Path to the genome to taxid file. This file is a TSV file \
+                            with two columns: genome ID (genome_id) and its corresponding taxid (taxid).',
+                        required=True)
+    parser.add_argument('--mode', type=str,
+                        help='The output format. Options: cami, biom, graphplan, all. Default: cami.', required=False,
+                        default='cami')
+    parser.add_argument('--sample_name', type=str,
+                        help='The sample name shown in header of the file. Default: Sample1.', required=False,
+                        default='Sample1')
+    parser.add_argument('--outfile_prefix', type=str, help='The prefix of the output file. Default: result.',
+                        required=False, default='result')
+    parser.add_argument('--outdir', type=str, help='The path to the output directory.', required=True)
+
+def main(args):
+    yacht_output = args.yacht_output
+    sheet_name = args.sheet_name
+    genome_to_taxid = args.genome_to_taxid
+    mode = args.mode
+    sample_name = args.sample_name
+    outfile_prefix = args.outfile_prefix
+    outdir = args.outdir
+
+    # check if the yacht output file exists
+    if not os.path.exists(yacht_output):
+        logger.error(f"{yacht_output} does not exist.")
+        raise ValueError
+
+    # check if the genome to taxid file exists
+    if not os.path.exists(genome_to_taxid):
+        logger.error(f"{genome_to_taxid} does not exist.")
+        raise ValueError
+
+    # check if the output directory exists and create it if not
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    # load the yacht output
+    yacht_output_df = pd.read_excel(yacht_output, sheet_name=sheet_name, engine='openpyxl')
+    # converet the first column to string
+    yacht_output_df['organism_name'] = yacht_output_df['organism_name'].astype(str)
+
+    # load the genome to taxid file
+    genome_to_taxid_df = pd.read_csv(genome_to_taxid, sep='\t', header=0)
+    # converet the first column to string
+    genome_to_taxid_df['genome_id'] = genome_to_taxid_df['genome_id'].astype(str)
+
+    # run the standardization
+    standardize_yacht_output = StandardizeYachtOutput()
+    if mode == 'all':
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'cami', sample_name)
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'biom', sample_name)
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'graphplan',
+                                     sample_name)
+    elif mode == 'cami':
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'cami', sample_name)
+    elif mode == 'biom':
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'biom', sample_name)
+    elif mode == 'graphplan':
+        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'graphplan',
+                                     sample_name)
+    else:
+        logger.error(f"{mode} is not a valid output format. Please choose from cami, biom, graphplan, all.")
+        exit(1)
+
 class StandardizeYachtOutput:
     """
     Standardize the output of YACHT to a format (options: CAMI, BIOM, GraphPlAn)
@@ -268,61 +335,8 @@ def run(self, yacht_output, genome_to_taxid, path_to_outdir, fileprefix='result'
     parser = argparse.ArgumentParser(
         description="This script convert YACHT output to a format (options: CAMI, BIOM, GraphPlAn).",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--yacht_output', type=str, help='Path to the YACHT output excel file.', required=True)
-    parser.add_argument('--sheet_name', type=str, help='The sheet name of the YACHT output excel file.', required=True)
-    parser.add_argument('--genome_to_taxid', type=str, help='Path to the genome to taxid file. This file is a TSV file \
-                        with two columns: genome ID (genome_id) and its corresponding taxid (taxid).', required=True)
-    parser.add_argument('--mode', type=str, help='The output format. Options: cami, biom, graphplan, all. Default: cami.', required=False, default='cami')
-    parser.add_argument('--sample_name', type=str, help='The sample name shown in header of the file. Default: Sample1.', required=False, default='Sample1')
-    parser.add_argument('--outfile_prefix', type=str, help='The prefix of the output file. Default: result.', required=False, default='result')
-    parser.add_argument('--outdir', type=str, help='The path to the output directory.', required=True)
-    
+
+    add_arguments(parser)
     # parse the arguments
     args = parser.parse_args()
-    yacht_output = args.yacht_output
-    sheet_name = args.sheet_name
-    genome_to_taxid = args.genome_to_taxid
-    mode = args.mode
-    sample_name = args.sample_name
-    outfile_prefix = args.outfile_prefix
-    outdir = args.outdir
-    
-    # check if the yacht output file exists
-    if not os.path.exists(yacht_output):
-        logger.error(f"{yacht_output} does not exist.")
-        raise ValueError
-    
-    # check if the genome to taxid file exists
-    if not os.path.exists(genome_to_taxid):
-        logger.error(f"{genome_to_taxid} does not exist.")
-        raise ValueError
-    
-    # check if the output directory exists and create it if not
-    if not os.path.exists(outdir):
-        os.makedirs(outdir)
-        
-    # load the yacht output
-    yacht_output_df = pd.read_excel(yacht_output, sheet_name=sheet_name, engine='openpyxl')
-    # converet the first column to string
-    yacht_output_df['organism_name'] = yacht_output_df['organism_name'].astype(str)
-    
-    # load the genome to taxid file
-    genome_to_taxid_df = pd.read_csv(genome_to_taxid, sep='\t', header=0)
-    # converet the first column to string
-    genome_to_taxid_df['genome_id'] = genome_to_taxid_df['genome_id'].astype(str)
-    
-    # run the standardization
-    standardize_yacht_output = StandardizeYachtOutput()
-    if mode == 'all':
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'cami', sample_name)
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'biom', sample_name)
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'graphplan', sample_name)
-    elif mode == 'cami':
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'cami', sample_name)
-    elif mode == 'biom':
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'biom', sample_name)
-    elif mode == 'graphplan':
-        standardize_yacht_output.run(yacht_output_df, genome_to_taxid_df, outdir, outfile_prefix, 'graphplan', sample_name)
-    else:
-        logger.error(f"{mode} is not a valid output format. Please choose from cami, biom, graphplan, all.")
-        exit(1)
+    main(args)

From 4d4e2ddff94fd39bfda706d83f1108cceb2460c0 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 17:34:23 -0500
Subject: [PATCH 09/76] test error due to moving scripts to srcs

---
 .github/workflows/runTest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 627d058..5ccb9b7 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,9 +13,9 @@ jobs:
         activate-environment: yacht_env
         environment-file: env/yacht_env.yml
     - name: make training data
-      run: python make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
+      run: python srcs/make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT
-      run: python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: python srcs/run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From a916b1119e391527475e467dea8c8a9b9fb039e2 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Tue, 14 Nov 2023 18:09:00 -0500
Subject: [PATCH 10/76] test error due to moving scripts to srcs

---
 srcs/make_training_data_from_sketches.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/srcs/make_training_data_from_sketches.py b/srcs/make_training_data_from_sketches.py
index 6da6a8c..70430e6 100644
--- a/srcs/make_training_data_from_sketches.py
+++ b/srcs/make_training_data_from_sketches.py
@@ -5,11 +5,12 @@
 import zipfile
 from pathlib import Path
 import pandas as pd
-import srcs.utils as utils
 from loguru import logger
 import json
 import shutil
 
+from srcs import utils
+
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 

From 9c1bfac4b909c78168665367176da011d9afa21c Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Wed, 15 Nov 2023 00:24:05 -0500
Subject: [PATCH 11/76] fix the utils relative path error

---
 srcs/make_training_data_from_sketches.py | 2 +-
 srcs/run_YACHT.py                        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/srcs/make_training_data_from_sketches.py b/srcs/make_training_data_from_sketches.py
index 70430e6..780eb52 100644
--- a/srcs/make_training_data_from_sketches.py
+++ b/srcs/make_training_data_from_sketches.py
@@ -9,7 +9,7 @@
 import json
 import shutil
 
-from srcs import utils
+import utils
 
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/srcs/run_YACHT.py b/srcs/run_YACHT.py
index f0b2045..86c8827 100644
--- a/srcs/run_YACHT.py
+++ b/srcs/run_YACHT.py
@@ -3,10 +3,10 @@
 import numpy as np
 import pandas as pd
 from pathlib import Path
-import srcs.hypothesis_recovery_src as hr
+import hypothesis_recovery_src as hr
 from scipy.sparse import load_npz
 import argparse
-import srcs.utils as utils
+import utils
 import json
 import warnings
 import zipfile

From 8c7dd6501fa9a115976860794c5790f65fedb02b Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Wed, 15 Nov 2023 00:26:51 -0500
Subject: [PATCH 12/76] remove the unused packages

---
 srcs/hypothesis_recovery_src.py          | 4 ++--
 srcs/make_training_data_from_sketches.py | 2 --
 srcs/run_YACHT.py                        | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/srcs/hypothesis_recovery_src.py b/srcs/hypothesis_recovery_src.py
index 483c24e..5c51d42 100644
--- a/srcs/hypothesis_recovery_src.py
+++ b/srcs/hypothesis_recovery_src.py
@@ -7,11 +7,11 @@
 import zipfile
 from tqdm import tqdm, trange
 from .utils import load_signature_with_ksize
-import concurrent.futures as cf
-from multiprocessing import Pool, Manager
+from multiprocessing import Pool
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple
 warnings.filterwarnings("ignore")
+
 from loguru import logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/srcs/make_training_data_from_sketches.py b/srcs/make_training_data_from_sketches.py
index 780eb52..4b48f62 100644
--- a/srcs/make_training_data_from_sketches.py
+++ b/srcs/make_training_data_from_sketches.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python
 import os, sys
-import sourmash
 import argparse
 import zipfile
 from pathlib import Path
-import pandas as pd
 from loguru import logger
 import json
 import shutil
diff --git a/srcs/run_YACHT.py b/srcs/run_YACHT.py
index 86c8827..6f651be 100644
--- a/srcs/run_YACHT.py
+++ b/srcs/run_YACHT.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python
 import os, sys
-import numpy as np
 import pandas as pd
 from pathlib import Path
 import hypothesis_recovery_src as hr
-from scipy.sparse import load_npz
 import argparse
 import utils
 import json
@@ -12,7 +10,7 @@
 import zipfile
 from pathlib import Path
 warnings.filterwarnings("ignore")
-from tqdm import tqdm
+
 from loguru import logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

From 84ad5a21bc3dfa217519275a6e1d081b5c7a0839 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Wed, 15 Nov 2023 00:46:00 -0500
Subject: [PATCH 13/76] modify README for the new locations of scripts

---
 README.md                       | 8 ++++----
 srcs/hypothesis_recovery_src.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index cde1d6b..4a214d5 100644
--- a/README.md
+++ b/README.md
@@ -26,10 +26,10 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/qu
 sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists
 
 # preprocess the reference genomes (training step)
-python ../make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
+python ../srcs/make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
 
 # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step)
-python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
+python ../srcs/run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
 
 # convert result to CAMI profile format (Optional)
 python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
@@ -173,7 +173,7 @@ In our benchmark with `GTDB representive genomes`, it takes `15 minutes` using `
 The script `make_training_data_from_sketches.py` extracts the sketches from the Zipfile-format reference database, and then turns them into a form usable by YACHT. In particular, it removes one of any two organisms that have ANI greater than the user-specified threshold as these two organisms are too close to be "distinguishable".
 
 ```bash 
-python make_training_data_from_sketches.py --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
+python srcs/make_training_data_from_sketches.py --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
 ```
 
 #### Parameter
@@ -205,7 +205,7 @@ The most important parameter of this script is `--ani_thresh`: this is average n
 After this, you are ready to perform the hypothesis test for each organism in your reference database. This can be accomplished with something like:
 
 ```bash
-python run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
+python srcs/run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
 ```
 
 #### Parameter
diff --git a/srcs/hypothesis_recovery_src.py b/srcs/hypothesis_recovery_src.py
index 5c51d42..2fef0e2 100644
--- a/srcs/hypothesis_recovery_src.py
+++ b/srcs/hypothesis_recovery_src.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import zipfile
 from tqdm import tqdm, trange
-from .utils import load_signature_with_ksize
+from utils import load_signature_with_ksize
 from multiprocessing import Pool
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple

From 3ee99f76dc980c2483ec0192c2e2f89f545ebc10 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Wed, 15 Nov 2023 16:23:28 -0500
Subject: [PATCH 14/76] imports from module

---
 srcs/hypothesis_recovery_src.py          | 5 ++++-
 srcs/make_training_data_from_sketches.py | 2 +-
 srcs/run_YACHT.py                        | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/srcs/hypothesis_recovery_src.py b/srcs/hypothesis_recovery_src.py
index 2fef0e2..995b666 100644
--- a/srcs/hypothesis_recovery_src.py
+++ b/srcs/hypothesis_recovery_src.py
@@ -6,10 +6,13 @@
 import pandas as pd
 import zipfile
 from tqdm import tqdm, trange
-from utils import load_signature_with_ksize
+
 from multiprocessing import Pool
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple
+
+from srcs.utils import load_signature_with_ksize
+
 warnings.filterwarnings("ignore")
 
 from loguru import logger
diff --git a/srcs/make_training_data_from_sketches.py b/srcs/make_training_data_from_sketches.py
index 4b48f62..2808afe 100644
--- a/srcs/make_training_data_from_sketches.py
+++ b/srcs/make_training_data_from_sketches.py
@@ -7,7 +7,7 @@
 import json
 import shutil
 
-import utils
+from srcs import utils
 
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/srcs/run_YACHT.py b/srcs/run_YACHT.py
index 6f651be..9114963 100644
--- a/srcs/run_YACHT.py
+++ b/srcs/run_YACHT.py
@@ -2,9 +2,9 @@
 import os, sys
 import pandas as pd
 from pathlib import Path
-import hypothesis_recovery_src as hr
+import srcs.hypothesis_recovery_src as hr
 import argparse
-import utils
+import srcs.utils
 import json
 import warnings
 import zipfile

From 7bd22c2c2a3af4d761e28478907be24318e8d654 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Wed, 15 Nov 2023 16:37:46 -0500
Subject: [PATCH 15/76] imports from module

---
 srcs/standardize_yacht_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/srcs/standardize_yacht_output.py b/srcs/standardize_yacht_output.py
index 28f0e6c..1657d50 100644
--- a/srcs/standardize_yacht_output.py
+++ b/srcs/standardize_yacht_output.py
@@ -10,7 +10,7 @@
 import biom
 import argparse
 from biom.util import biom_open
-from utils import get_cami_profile
+from srcs.utils import get_cami_profile
 from collections import OrderedDict
 from loguru import logger
 logger.remove()

From 13627c895f78b380f68ce8a8b5e903f2d51ed0f4 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Wed, 15 Nov 2023 17:23:51 -0500
Subject: [PATCH 16/76] adding pyo3-branchwater

---
 meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/meta.yaml b/meta.yaml
index 73b9366..da98a56 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -27,6 +27,7 @@ requirements:
     - biom-format
     - pytaxonkit
     - openpyxl
+    - pyo3-branchwater
 
 about:
   home: https://github.com/KoslickiLab/YACHT

From 839b941bd53384eb370807f890a8ed925ed26b82 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 18:07:36 -0500
Subject: [PATCH 17/76] test errors

---
 .github/workflows/runTest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 5ccb9b7..1a5daaa 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,9 +13,9 @@ jobs:
         activate-environment: yacht_env
         environment-file: env/yacht_env.yml
     - name: make training data
-      run: python srcs/make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
+      run: python srcs.make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT
-      run: python srcs/run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: python srcs.run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From 92b837121256b143eb1c45d8a9cd515636167d46 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 18:37:11 -0500
Subject: [PATCH 18/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 1a5daaa..00c49c4 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,9 +13,9 @@ jobs:
         activate-environment: yacht_env
         environment-file: env/yacht_env.yml
     - name: make training data
-      run: python srcs.make_training_data_from_sketches.py --ref_file 'tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
+      run: python srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
     - name: run YACHT
-      run: python srcs.run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From 45b2983c4d7ab9926ce3e95cd6fb9c6fb95e78f0 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 16 Nov 2023 19:06:36 -0500
Subject: [PATCH 19/76] rename "srcs" to "src" and copy content in cli.py into
 __init__.py

---
 srcs/cli.py => src/__init__.py                    | 8 ++++----
 {srcs => src}/hypothesis_recovery_src.py          | 2 +-
 {srcs => src}/make_training_data_from_sketches.py | 3 +--
 {srcs => src}/run_YACHT.py                        | 4 ++--
 {srcs => src}/standardize_yacht_output.py         | 2 +-
 {srcs => src}/utils.py                            | 0
 srcs/__init__.py                                  | 0
 7 files changed, 9 insertions(+), 10 deletions(-)
 rename srcs/cli.py => src/__init__.py (86%)
 rename {srcs => src}/hypothesis_recovery_src.py (99%)
 rename {srcs => src}/make_training_data_from_sketches.py (99%)
 rename {srcs => src}/run_YACHT.py (99%)
 rename {srcs => src}/standardize_yacht_output.py (99%)
 rename {srcs => src}/utils.py (100%)
 delete mode 100644 srcs/__init__.py

diff --git a/srcs/cli.py b/src/__init__.py
similarity index 86%
rename from srcs/cli.py
rename to src/__init__.py
index c3fadd4..50125ec 100644
--- a/srcs/cli.py
+++ b/src/__init__.py
@@ -1,8 +1,8 @@
 import argparse
 
-from srcs import run_YACHT
-from srcs import make_training_data_from_sketches
-from srcs import standardize_yacht_output
+from . import run_YACHT
+from . import make_training_data_from_sketches
+from . import standardize_yacht_output
 
 
 def main():
@@ -31,4 +31,4 @@ def main():
         parser.print_help()
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/srcs/hypothesis_recovery_src.py b/src/hypothesis_recovery_src.py
similarity index 99%
rename from srcs/hypothesis_recovery_src.py
rename to src/hypothesis_recovery_src.py
index 995b666..d8ddee9 100644
--- a/srcs/hypothesis_recovery_src.py
+++ b/src/hypothesis_recovery_src.py
@@ -11,7 +11,7 @@
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple
 
-from srcs.utils import load_signature_with_ksize
+from utils import load_signature_with_ksize
 
 warnings.filterwarnings("ignore")
 
diff --git a/srcs/make_training_data_from_sketches.py b/src/make_training_data_from_sketches.py
similarity index 99%
rename from srcs/make_training_data_from_sketches.py
rename to src/make_training_data_from_sketches.py
index 2808afe..10a7085 100644
--- a/srcs/make_training_data_from_sketches.py
+++ b/src/make_training_data_from_sketches.py
@@ -6,8 +6,7 @@
 from loguru import logger
 import json
 import shutil
-
-from srcs import utils
+import utils
 
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/srcs/run_YACHT.py b/src/run_YACHT.py
similarity index 99%
rename from srcs/run_YACHT.py
rename to src/run_YACHT.py
index 9114963..6f651be 100644
--- a/srcs/run_YACHT.py
+++ b/src/run_YACHT.py
@@ -2,9 +2,9 @@
 import os, sys
 import pandas as pd
 from pathlib import Path
-import srcs.hypothesis_recovery_src as hr
+import hypothesis_recovery_src as hr
 import argparse
-import srcs.utils
+import utils
 import json
 import warnings
 import zipfile
diff --git a/srcs/standardize_yacht_output.py b/src/standardize_yacht_output.py
similarity index 99%
rename from srcs/standardize_yacht_output.py
rename to src/standardize_yacht_output.py
index 1657d50..28f0e6c 100644
--- a/srcs/standardize_yacht_output.py
+++ b/src/standardize_yacht_output.py
@@ -10,7 +10,7 @@
 import biom
 import argparse
 from biom.util import biom_open
-from srcs.utils import get_cami_profile
+from utils import get_cami_profile
 from collections import OrderedDict
 from loguru import logger
 logger.remove()
diff --git a/srcs/utils.py b/src/utils.py
similarity index 100%
rename from srcs/utils.py
rename to src/utils.py
diff --git a/srcs/__init__.py b/srcs/__init__.py
deleted file mode 100644
index e69de29..0000000

From 305765e73970bcfe1489083cc1c8d5547ea07586 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 16 Nov 2023 19:08:14 -0500
Subject: [PATCH 20/76] add pip and change the installation of branchwater from
 github to pip

---
 env/yacht_env.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/env/yacht_env.yml b/env/yacht_env.yml
index 75ebbfc..f2ed57e 100644
--- a/env/yacht_env.yml
+++ b/env/yacht_env.yml
@@ -1,7 +1,7 @@
 name: yacht_env
 channels:
-  - bioconda
   - conda-forge
+  - bioconda
   - defaults
 dependencies:
   - python>3.6
@@ -19,6 +19,7 @@ dependencies:
   - tqdm
   - biom-format
   - pytaxonkit
+  - pip
   - pip:
     - openpyxl
-    - git+https://github.com/sourmash-bio/pyo3_branchwater@v0.8.1
+    - pyo3_branchwater==0.8.1

From 6430eeb44cc68237a58fed3225cdcae3258867a3 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 16 Nov 2023 19:08:32 -0500
Subject: [PATCH 21/76] update ../README.md

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 4a214d5..6c2e166 100644
--- a/README.md
+++ b/README.md
@@ -26,13 +26,13 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/qu
 sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists
 
 # preprocess the reference genomes (training step)
-python ../srcs/make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
+python ../src/make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
 
 # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step)
-python ../srcs/run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
+python ../src/run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
 
 # convert result to CAMI profile format (Optional)
-python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
+python ../src/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
 ```
 
 There will be an output EXCEL file `result.xlsx` recoding the presence of reference genomes with different spreadsheets given the minimum coverage of `1 0.6 0.2 0.1`.
@@ -173,7 +173,7 @@ In our benchmark with `GTDB representive genomes`, it takes `15 minutes` using `
 The script `make_training_data_from_sketches.py` extracts the sketches from the Zipfile-format reference database, and then turns them into a form usable by YACHT. In particular, it removes one of any two organisms that have ANI greater than the user-specified threshold as these two organisms are too close to be "distinguishable".
 
 ```bash 
-python srcs/make_training_data_from_sketches.py --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
+python src/make_training_data_from_sketches.py --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
 ```
 
 #### Parameter
@@ -205,7 +205,7 @@ The most important parameter of this script is `--ani_thresh`: this is average n
 After this, you are ready to perform the hypothesis test for each organism in your reference database. This can be accomplished with something like:
 
 ```bash
-python srcs/run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
+python src/run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
 ```
 
 #### Parameter
@@ -243,13 +243,13 @@ Other interesting columns include:
 
 ### Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn)
 
-When we get the EXCEL result file from run_YACHT.py, you can run `standardize_yacht_output.py` under `srcs` folder to covert the YACHT result to other popular output formats (Currently, only `cami`, `biom`, `graphplan` are supported).
+When we get the EXCEL result file from run_YACHT.py, you can run `standardize_yacht_output.py` under `src` folder to covert the YACHT result to other popular output formats (Currently, only `cami`, `biom`, `graphplan` are supported).
 
-__Note__: Before you run `srcs/standardize_yacht_output.py`, you need to prepare a TSV file `genome_to_taxid.tsv` containing two columns: genome ID (genome_id) and its corresponding taxid (taxid). An example can be found [here](demo/toy_genome_to_taxid.tsv). You need to prepare it according to the reference database genomes you used. 
+__Note__: Before you run `src/standardize_yacht_output.py`, you need to prepare a TSV file `genome_to_taxid.tsv` containing two columns: genome ID (genome_id) and its corresponding taxid (taxid). An example can be found [here](demo/toy_genome_to_taxid.tsv). You need to prepare it according to the reference database genomes you used. 
 
 Then you are ready to run `standardize_yacht_output.py` with something like:
 ```bash
-python srcs/standardize_yacht_output.py --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir ./
+python src/standardize_yacht_output.py --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir ./
 ```
 
 | Parameter         | Explanation                                                  |

From d75e5c4394f2ee25278160a023377d8134a1b518 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 16 Nov 2023 19:10:11 -0500
Subject: [PATCH 22/76] change the main entry to src:main

---
 meta.yaml | 9 +++++++--
 setup.py  | 9 ++++-----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/meta.yaml b/meta.yaml
index 73b9366..8ef518b 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -7,9 +7,12 @@ source:
 
 build:
   noarch: python
-  script: python -m pip install . --no-deps --ignore-installed --no-cache-dir
+  script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed --no-cache-dir -vvv"
 
 requirements:
+  build:
+    - python
+    - setuptools
   host:
     - python >3.6
     - pip
@@ -21,12 +24,14 @@ requirements:
     - numpy
     - pandas
     - scikit-learn
+    - codecov
+    - pytest
+    - pytest-cov
     - loguru
     - maturin >=1,<2
     - tqdm
     - biom-format
     - pytaxonkit
-    - openpyxl
 
 about:
   home: https://github.com/KoslickiLab/YACHT
diff --git a/setup.py b/setup.py
index 44dd1df..0984697 100644
--- a/setup.py
+++ b/setup.py
@@ -7,20 +7,19 @@
     description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
     packages=find_packages(),
     install_requires=[
-        'sourmash>=4.8.3,<5',
         'scipy',
         'numpy',
         'pandas',
         'scikit-learn',
         'loguru',
         'tqdm',
-        'biom-format',
-        'pytaxonkit',
-        'openpyxl'
+        'openpyxl',
+        'pyo3-branchwater==0.8.1'
     ],
     entry_points={
         'console_scripts': [
-            'yacht = srcs.cli:main',
+            'yacht = src:main',
         ],
     },
+    python_requires='>=3.6',
 )

From c40ad5456b426f01dde10543d2136a09c4b69611 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:15:33 -0500
Subject: [PATCH 23/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 00c49c4..402307a 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -7,13 +7,13 @@ jobs:
       run:
         shell: bash -el {0}
     steps:
-    - uses: actions/checkout@v4
-    - uses: conda-incubator/setup-miniconda@v2
-      with:
-        activate-environment: yacht_env
-        environment-file: env/yacht_env.yml
+#    - uses: actions/checkout@v4
+#    - uses: conda-incubator/setup-miniconda@v2
+#      with:
+#        activate-environment: yacht_env
+#        environment-file: env/yacht_env.yml
     - name: make training data
-      run: python srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
+      run: python srcs/srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
     - name: run YACHT
       run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit

From 888b2604b08fe0275d99c9a048433dfa9395e89d Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:17:25 -0500
Subject: [PATCH 24/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 402307a..554ce8a 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,7 +13,7 @@ jobs:
 #        activate-environment: yacht_env
 #        environment-file: env/yacht_env.yml
     - name: make training data
-      run: python srcs/srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
+      run: python -m srcs/srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
     - name: run YACHT
       run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit

From d1c3d885be080e1f8ed66f98fedcda41119a0875 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:18:09 -0500
Subject: [PATCH 25/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 554ce8a..ae9925a 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,7 +13,7 @@ jobs:
 #        activate-environment: yacht_env
 #        environment-file: env/yacht_env.yml
     - name: make training data
-      run: python -m srcs/srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
+      run: python -m srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
     - name: run YACHT
       run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit

From 8b8ba0e7cacfff722510543b2faa68b9acc57f95 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:19:54 -0500
Subject: [PATCH 26/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index ae9925a..4a737f3 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -13,7 +13,7 @@ jobs:
 #        activate-environment: yacht_env
 #        environment-file: env/yacht_env.yml
     - name: make training data
-      run: python -m srcs.make_training_data_from_sketches.py --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
+      run: python -m srcs.make_training_data_from_sketches --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
     - name: run YACHT
       run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit

From 23d5a0f0d3f8ce6ead2d9568c4f9d60461e32d8c Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:24:24 -0500
Subject: [PATCH 27/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 4a737f3..3fba674 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -7,15 +7,15 @@ jobs:
       run:
         shell: bash -el {0}
     steps:
-#    - uses: actions/checkout@v4
-#    - uses: conda-incubator/setup-miniconda@v2
-#      with:
-#        activate-environment: yacht_env
-#        environment-file: env/yacht_env.yml
+    - uses: actions/checkout@v4
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        activate-environment: yacht_env
+        environment-file: env/yacht_env.yml
     - name: make training data
-      run: python -m srcs.make_training_data_from_sketches --ref_file '../tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ../ --force
+      run: python -m srcs.make_training_data_from_sketches --ref_file './tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT
-      run: python srcs.run_YACHT.py --json ../gtdb_ani_thresh_0.95_config.json --sample_file '../tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: python srcs.run_YACHT.py --json ./gtdb_ani_thresh_0.95_config.json --sample_file './tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From 374cb29fbb4a74beea9948823a79f3f79e41e97d Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:39:33 -0500
Subject: [PATCH 28/76] test errors due to directory changes

---
 .github/workflows/runTest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 3fba674..469f394 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -15,7 +15,7 @@ jobs:
     - name: make training data
       run: python -m srcs.make_training_data_from_sketches --ref_file './tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT
-      run: python srcs.run_YACHT.py --json ./gtdb_ani_thresh_0.95_config.json --sample_file './tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: python -m srcs.run_YACHT --json ./gtdb_ani_thresh_0.95_config.json --sample_file './tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From c8a3f91c0e0c7171bafc55e85d64b23c5322b828 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 16 Nov 2023 19:44:51 -0500
Subject: [PATCH 29/76] test errors due to directory changes

---
 meta.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/meta.yaml b/meta.yaml
index da98a56..6f5a0ab 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -26,8 +26,9 @@ requirements:
     - tqdm
     - biom-format
     - pytaxonkit
-    - openpyxl
-    - pyo3-branchwater
+    - pip:
+      - openpyxl
+      - git+https://github.com/sourmash-bio/pyo3_branchwater@v0.8.1
 
 about:
   home: https://github.com/KoslickiLab/YACHT

From 5b039e7c08118e6de854328181f5c2ab964ee919 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 10:56:38 -0500
Subject: [PATCH 30/76] fixed the dependency package issue

---
 meta.yaml | 8 +++++++-
 setup.py  | 6 ------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/meta.yaml b/meta.yaml
index 8ef518b..8cc590b 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -7,7 +7,10 @@ source:
 
 build:
   noarch: python
-  script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed --no-cache-dir -vvv"
+  script: "{{ PYTHON }} -m pip install . --ignore-installed --no-cache-dir -vvv"
+  script_env:
+    - PYTHON
+  post-link: post-link.sh
 
 requirements:
   build:
@@ -18,6 +21,7 @@ requirements:
     - pip
   run:
     - python >3.6
+    - gcc
     - sourmash >=4.8.3,<5
     - rust
     - scipy
@@ -30,8 +34,10 @@ requirements:
     - loguru
     - maturin >=1,<2
     - tqdm
+    - pip
     - biom-format
     - pytaxonkit
+    - openpyxl
 
 about:
   home: https://github.com/KoslickiLab/YACHT
diff --git a/setup.py b/setup.py
index 0984697..9e44dfd 100644
--- a/setup.py
+++ b/setup.py
@@ -7,12 +7,6 @@
     description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
     packages=find_packages(),
     install_requires=[
-        'scipy',
-        'numpy',
-        'pandas',
-        'scikit-learn',
-        'loguru',
-        'tqdm',
         'openpyxl',
         'pyo3-branchwater==0.8.1'
     ],

From f4d8cbd5c762f71d285134a14f9590a8de49a977 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 10:58:14 -0500
Subject: [PATCH 31/76] update the package import for the way command "yacht
 xxx"

---
 src/hypothesis_recovery_src.py          | 2 +-
 src/make_training_data_from_sketches.py | 2 +-
 src/run_YACHT.py                        | 4 ++--
 src/standardize_yacht_output.py         | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/hypothesis_recovery_src.py b/src/hypothesis_recovery_src.py
index d8ddee9..911a0ca 100644
--- a/src/hypothesis_recovery_src.py
+++ b/src/hypothesis_recovery_src.py
@@ -11,7 +11,7 @@
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple
 
-from utils import load_signature_with_ksize
+from .utils import load_signature_with_ksize
 
 warnings.filterwarnings("ignore")
 
diff --git a/src/make_training_data_from_sketches.py b/src/make_training_data_from_sketches.py
index 10a7085..fcbf33e 100644
--- a/src/make_training_data_from_sketches.py
+++ b/src/make_training_data_from_sketches.py
@@ -6,7 +6,7 @@
 from loguru import logger
 import json
 import shutil
-import utils
+from . import utils
 
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/src/run_YACHT.py b/src/run_YACHT.py
index 6f651be..38b3c17 100644
--- a/src/run_YACHT.py
+++ b/src/run_YACHT.py
@@ -2,9 +2,9 @@
 import os, sys
 import pandas as pd
 from pathlib import Path
-import hypothesis_recovery_src as hr
+from . import hypothesis_recovery_src as hr
 import argparse
-import utils
+from . import utils
 import json
 import warnings
 import zipfile
diff --git a/src/standardize_yacht_output.py b/src/standardize_yacht_output.py
index 28f0e6c..d3356b7 100644
--- a/src/standardize_yacht_output.py
+++ b/src/standardize_yacht_output.py
@@ -10,7 +10,7 @@
 import biom
 import argparse
 from biom.util import biom_open
-from utils import get_cami_profile
+from .utils import get_cami_profile
 from collections import OrderedDict
 from loguru import logger
 logger.remove()

From aca18567a7d78a3d86f66a3edcd5710936b3da11 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 10:59:04 -0500
Subject: [PATCH 32/76] update README.md for "yacht xxx" commands

---
 README.md | 50 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 6c2e166..c159ee3 100644
--- a/README.md
+++ b/README.md
@@ -26,13 +26,13 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/qu
 sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists
 
 # preprocess the reference genomes (training step)
-python ../src/make_training_data_from_sketches.py --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
+yacht train --ref_file ref.sig.zip --ksize 31 --num_threads ${NUM_THREADS} --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./ --force
 
 # run YACHT algorithm to check the presence of reference genomes in the query sample (inference step)
-python ../src/run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
+yacht run --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads ${NUM_THREADS} --min_coverage_list 1 0.6 0.2 0.1 --out ./result.xlsx
 
 # convert result to CAMI profile format (Optional)
-python ../src/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
+yacht convert --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./
 ```
 
 There will be an output EXCEL file `result.xlsx` recoding the presence of reference genomes with different spreadsheets given the minimum coverage of `1 0.6 0.2 0.1`.
@@ -62,22 +62,38 @@ There will be an output EXCEL file `result.xlsx` recoding the presence of refere
 
 ### Conda
 
-A conda release will be coming soon. In the meantime, please install manually.
+Please first install [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then you can simply run the following command to install YACHT:
+```bash
+# create conda environment
+conda create -n yacht_env
 
-### Manual installation
+# activiate environment
+conda activate yacht_env
+
+# install YACHT
+conda install -c bioconda yacht
+```
 
-YACHT requires Python 3 or higher. We recommend using a virtual environment (such as [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)) to run YACHT. To create a virtual environment, run:
+### Manual installation
+YACHT requires Python 3 or higher. We recommend using a virtual environment (such as conda) to run YACHT. To create a virtual environment, run:
 
 ```bash
 # Clone the repo
 git clone https://github.com/KoslickiLab/YACHT.git
 cd YACHT
 
-# Set up an environment for YACHT
-bash setup.sh
+## Build a local conda environment
+conda install conda-build
+conda build .
 
-# Activiate YACHT environment
+# create conda environment
+conda create -n yacht_env
+
+# activiate environment
 conda activate yacht_env
+
+# install YACHT locally
+conda install --use-local yacht
 ```
 
 </br>
@@ -170,10 +186,10 @@ In our benchmark with `GTDB representive genomes`, it takes `15 minutes` using `
 
 </br>
 
-The script `make_training_data_from_sketches.py` extracts the sketches from the Zipfile-format reference database, and then turns them into a form usable by YACHT. In particular, it removes one of any two organisms that have ANI greater than the user-specified threshold as these two organisms are too close to be "distinguishable".
+The command `yacht train` extracts the sketches from the Zipfile-format reference database, and then turns them into a form usable by YACHT. In particular, it removes one of any two organisms that have ANI greater than the user-specified threshold as these two organisms are too close to be "distinguishable".
 
 ```bash 
-python src/make_training_data_from_sketches.py --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
+yacht train --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
 ```
 
 #### Parameter
@@ -202,10 +218,10 @@ The most important parameter of this script is `--ani_thresh`: this is average n
 
 ### Run the YACHT algorithm
 
-After this, you are ready to perform the hypothesis test for each organism in your reference database. This can be accomplished with something like:
+After this, you are ready to perform the hypothesis test via `yacht run` for each organism in your reference database. This can be accomplished with something like:
 
 ```bash
-python src/run_YACHT.py --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
+yacht run --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
 ```
 
 #### Parameter
@@ -243,13 +259,13 @@ Other interesting columns include:
 
 ### Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn)
 
-When we get the EXCEL result file from run_YACHT.py, you can run `standardize_yacht_output.py` under `src` folder to covert the YACHT result to other popular output formats (Currently, only `cami`, `biom`, `graphplan` are supported).
+When we get the EXCEL result file from run_YACHT.py, you can run `yacht convert` to covert the YACHT result to other popular output formats (Currently, only `cami`, `biom`, `graphplan` are supported).
 
-__Note__: Before you run `src/standardize_yacht_output.py`, you need to prepare a TSV file `genome_to_taxid.tsv` containing two columns: genome ID (genome_id) and its corresponding taxid (taxid). An example can be found [here](demo/toy_genome_to_taxid.tsv). You need to prepare it according to the reference database genomes you used. 
+__Note__: Before you run `yacht convert`, you need to prepare a TSV file `genome_to_taxid.tsv` containing two columns: genome ID (genome_id) and its corresponding taxid (taxid). An example can be found [here](demo/toy_genome_to_taxid.tsv). You need to prepare it according to the reference database genomes you used. 
 
-Then you are ready to run `standardize_yacht_output.py` with something like:
+Then you are ready to run `yacht convert` with something like:
 ```bash
-python src/standardize_yacht_output.py --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir ./
+yacht convert --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir ./
 ```
 
 | Parameter         | Explanation                                                  |

From c5fa3737776d9d10d054b9fbf73518819e760f56 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 11:00:22 -0500
Subject: [PATCH 33/76] update runTest.yml for "yacht xxx" commands

---
 .github/workflows/runTest.yml | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 469f394..1deb3fc 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -12,10 +12,22 @@ jobs:
       with:
         activate-environment: yacht_env
         environment-file: env/yacht_env.yml
+        auto-activate-base: false
+
+    - name: install Conda Build
+      run: conda install conda-build
+    - name: build YACHT
+      run: conda build .
+    - name: create Conda Environment
+      run: conda create -n yacht_env
+    - name: activate Environment
+      run: conda activate yacht_env
+    - name: install YACHT
+      run: conda install --use-local yacht
     - name: make training data
-      run: python -m srcs.make_training_data_from_sketches --ref_file './tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
+      run: yacht train --ref_file './tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT
-      run: python -m srcs.run_YACHT --json ./gtdb_ani_thresh_0.95_config.json --sample_file './tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
+      run: yacht run --json ./gtdb_ani_thresh_0.95_config.json --sample_file './tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1
     - name: test-unit
       run: pytest tests/test_unit.py --cov=./  --cov-report xml:covunit.xml
     - name: test-utils

From 2ff8aab1033891525dfb51f361bfa29cf9139e4b Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 11:01:17 -0500
Subject: [PATCH 34/76] update tests code for "yacht xxx" commands

---
 tests/integration_tests.py |  4 ++--
 tests/test_unit.py         |  2 +-
 tests/test_utils.py        |  2 +-
 tests/test_workflow.py     | 12 ++++++------
 tests/unittests.py         |  6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/integration_tests.py b/tests/integration_tests.py
index 4d15e35..2822138 100644
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -44,7 +44,7 @@ def test_make_training_data_from_sketches():
     intermediate_files_dir = f'{prefix}_intermediate_files'
 
     command = [
-        'python', 'make_training_data_from_sketches.py',
+        'yacht', 'train',
         '--ref_file', ref_file,
         '--ksize', ksize,
         '--prefix', prefix,
@@ -65,7 +65,7 @@ def test_make_training_data_from_sketches():
         assert config['ani_thresh'] == float(ani_thresh)
         
 def test_run_yacht():
-    cmd = "python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1"
+    cmd = "yacht run --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1"
     
     res = subprocess.run(cmd, shell=True, check=True)
     assert res.returncode == 0
diff --git a/tests/test_unit.py b/tests/test_unit.py
index d3f04f6..e5bd45b 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -3,7 +3,7 @@
 import sys
 import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from srcs.hypothesis_recovery_src import get_alt_mut_rate
+from src.hypothesis_recovery_src import get_alt_mut_rate
 
 
 def test_get_alt_mut_rate_1():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c94a9f1..19a9edc 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -6,7 +6,7 @@
 # add the parent directory to the path
 import sys
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from srcs import utils
+from src import utils
 import sourmash
 
 
diff --git a/tests/test_workflow.py b/tests/test_workflow.py
index 1495198..0faf826 100644
--- a/tests/test_workflow.py
+++ b/tests/test_workflow.py
@@ -31,7 +31,7 @@ def test_full_workflow():
     # Remove the intermediate folder
     shutil.rmtree(os.path.join(data_dir, intermediate_dir), ignore_errors=True)
     #  python ../make_training_data_from_sketches.py --ref_file testdata/20_genomes_sketches.zip --ksize 31 --prefix 20_genomes_trained --outdir testdata/
-    cmd = f"python {os.path.join(script_dir, 'make_training_data_from_sketches.py')} --ref_file {reference_sketches}" \
+    cmd = f"yacht train --ref_file {reference_sketches}" \
           f" --prefix {full_out_prefix} --ksize 31 --outdir {data_dir}"
     res = subprocess.run(cmd, shell=True, check=True)
     # check that no errors were raised
@@ -46,7 +46,7 @@ def test_full_workflow():
     if exists(abundance_file):
         os.remove(abundance_file)
     # python ../run_YACHT.py --json testdata/20_genomes_trained_config.json --sample_file testdata/sample.sig.zip --out_file result.xlsx
-    cmd = f"python {os.path.join(script_dir, 'run_YACHT.py')} --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out {os.path.join(data_dir,abundance_file)} --show_all"
+    cmd = f"yacht run --json {os.path.join(data_dir, '20_genomes_trained_config.json')} --sample_file {sample_sketches} --significance 0.99 --min_coverage 0.001 --out {os.path.join(data_dir,abundance_file)} --show_all"
     res = subprocess.run(cmd, shell=True, check=True)
     # check that no errors were raised
     assert res.returncode == 0
@@ -66,7 +66,7 @@ def test_full_workflow():
 def test_incorrect_workflow1():
     script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
     demo_dir = os.path.join(script_dir, "demo")
-    cmd = f"python run_YACHT.py --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
+    cmd = f"yacht run --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
     res = subprocess.run(cmd, shell=True, check=False)
     # this should fail
     assert res.returncode == 1
@@ -77,11 +77,11 @@ def test_demo_workflow():
     _ = subprocess.run(cmd, shell=True, check=True)
     cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
+    cmd = "cd demo; yacht train --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
+    cmd = "cd demo; yacht run --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
+    cmd = "cd demo; yacht convert --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
     _ = subprocess.run(cmd, shell=True, check=True)
 
 
diff --git a/tests/unittests.py b/tests/unittests.py
index ec7bf18..8a0f6ad 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -8,8 +8,8 @@
 import sys
 import shutil
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from srcs.hypothesis_recovery_src import single_hyp_test,  get_alt_mut_rate
-from  srcs.utils import remove_corr_organisms_from_ref, check_file_existence, get_cami_profile, get_column_indices, get_info_from_single_sig, collect_signature_info, run_multisearch
+from src.hypothesis_recovery_src import single_hyp_test,  get_alt_mut_rate
+from src.utils import remove_corr_organisms_from_ref, check_file_existence, get_cami_profile, get_column_indices, get_info_from_single_sig, collect_signature_info, run_multisearch
 
 @pytest.fixture
 def test_output_files():
@@ -125,7 +125,7 @@ def test_get_info_from_single_sig():
             with open(tmp_sig_file, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
-        ksize = 0
+        ksize = 31
         result = get_info_from_single_sig(tmp_sig_file, ksize)
 
         expected_name = "VIKJ01000003.1 Chitinophagaceae bacterium isolate X1_MetaBAT.39 scaffold_1008, whole genome shotgun sequence"

From d0fa86f6b033f5cf6a05c7bee70f3c3219287634 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 11:50:21 -0500
Subject: [PATCH 35/76] fixed a bug in unittests.py

---
 tests/unittests.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unittests.py b/tests/unittests.py
index 8a0f6ad..b9fdb5b 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -128,10 +128,10 @@ def test_get_info_from_single_sig():
         ksize = 31
         result = get_info_from_single_sig(tmp_sig_file, ksize)
 
-        expected_name = "VIKJ01000003.1 Chitinophagaceae bacterium isolate X1_MetaBAT.39 scaffold_1008, whole genome shotgun sequence"
-        expected_md5sum = "96cb85214535b0f9723a6abc17097821"
+        expected_name = "VMDK01000027.1 Sphingobacteriia bacterium isolate 28_1 c_000000000062, whole genome shotgun sequence"
+        expected_md5sum = "04212e93c2172d4df49dc5d8c2973d8b"
         expected_mean_abundance = 1.0
-        expected_hashes_len = 1984
+        expected_hashes_len = 2437
         expected_scaled = 1000
 
         assert result[0] == expected_name

From b32b0d740d1db2f3c44b9f15c7c7642358274ad6 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 15:50:02 -0500
Subject: [PATCH 36/76] fixed a potential bug suggested by sonarcloud

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 34808f4..cda1834 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -314,7 +314,7 @@ def get_cami_profile(cami_content: List[str]) -> List[Tuple[str, Dict[str, str],
             prediction = predictions_dict[taxid]
             prediction.percentage += float(row_data[index_percentage])
         else:
-            if float(row_data[index_percentage]) == .0:
+            if int(float(row_data[index_percentage])) == 0:
                 continue
             prediction = Prediction()
             predictions_dict[taxid] = prediction

From cc4db9b6598dacbea7ab6c40fe48b339c814da38 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 15:56:17 -0500
Subject: [PATCH 37/76] add post-link.sh

---
 post-link.sh | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 post-link.sh

diff --git a/post-link.sh b/post-link.sh
new file mode 100644
index 0000000..d7f21e0
--- /dev/null
+++ b/post-link.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+echo "Installing pyo3-branchwater via pip..."
+"${PREFIX}/bin/pip" install pyo3-branchwater==0.8.1

From ad99957b6a5ef8205012b94a3457089d1884f6ca Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 16:42:09 -0500
Subject: [PATCH 38/76] add description to each command

---
 src/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/__init__.py b/src/__init__.py
index 50125ec..deb8a06 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -10,17 +10,17 @@ def main():
     subparsers = parser.add_subparsers(dest='command')
 
     # Run command
-    run_parser = subparsers.add_parser('run')
+    run_parser = subparsers.add_parser('run', description='Pre-process the reference genomes')
     run_YACHT.add_arguments(run_parser)
     run_parser.set_defaults(func=run_YACHT.main)
 
     # Train command
-    train_parser = subparsers.add_parser('train')
+    train_parser = subparsers.add_parser('train', description='Run the YACHT algorithm')
     make_training_data_from_sketches.add_arguments(train_parser)
     train_parser.set_defaults(func=make_training_data_from_sketches.main)
 
     # Convert command
-    convert_parser = subparsers.add_parser('convert')
+    convert_parser = subparsers.add_parser('convert', description='Convert YACHT result to other popular output formats')
     standardize_yacht_output.add_arguments(convert_parser)
     convert_parser.set_defaults(func=standardize_yacht_output.main)
 

From b55c7842d4644efa914149f64bc92f4da5364f7f Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 17 Nov 2023 16:46:13 -0500
Subject: [PATCH 39/76] fixed the mistake for the description

---
 src/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/__init__.py b/src/__init__.py
index deb8a06..512a7d3 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -9,16 +9,16 @@ def main():
     parser = argparse.ArgumentParser(prog='yacht')
     subparsers = parser.add_subparsers(dest='command')
 
-    # Run command
-    run_parser = subparsers.add_parser('run', description='Pre-process the reference genomes')
-    run_YACHT.add_arguments(run_parser)
-    run_parser.set_defaults(func=run_YACHT.main)
-
     # Train command
-    train_parser = subparsers.add_parser('train', description='Run the YACHT algorithm')
+    train_parser = subparsers.add_parser('train', description='Pre-process the reference genomes')
     make_training_data_from_sketches.add_arguments(train_parser)
     train_parser.set_defaults(func=make_training_data_from_sketches.main)
 
+    # Run command
+    run_parser = subparsers.add_parser('run', description='Run the YACHT algorithm')
+    run_YACHT.add_arguments(run_parser)
+    run_parser.set_defaults(func=run_YACHT.main)
+
     # Convert command
     convert_parser = subparsers.add_parser('convert', description='Convert YACHT result to other popular output formats')
     standardize_yacht_output.add_arguments(convert_parser)

From 5b84d0fb11da9e2cab221f643b8e6055b06b3478 Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Mon, 27 Nov 2023 13:13:22 -0500
Subject: [PATCH 40/76] gitignore changes for Pycharm files

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index ae182aa..da86931 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,7 @@ __pycache__
 
 ## nohup file
 nohup.out
-**/nohup.out
\ No newline at end of file
+**/nohup.out
+
+## Jetbrain IDE
+.idea/

From 49ebd66725c30858c48242feaf43d4feaf58fbaf Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 17:42:06 -0500
Subject: [PATCH 41/76] fixe the path issues in tests

---
 tests/test_unit.py     |  4 +++-
 tests/test_utils.py    |  6 ++++--
 tests/test_workflow.py | 19 ++++++++++---------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/test_unit.py b/tests/test_unit.py
index e5bd45b..71c23ce 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -2,7 +2,9 @@
 import numpy as np
 import sys
 import os
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+cpath = os.path.dirname(os.path.realpath(__file__))
+project_path = os.path.join(cpath,'..')
+sys.path.append(project_path)
 from src.hypothesis_recovery_src import get_alt_mut_rate
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 19a9edc..6936ae3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,13 +5,15 @@
 import pandas as pd
 # add the parent directory to the path
 import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+cpath = os.path.dirname(os.path.realpath(__file__))
+project_path = os.path.join(cpath,'..')
+sys.path.append(project_path)
 from src import utils
 import sourmash
 
 
 def to_testing_data(file):
-    return os.path.join('tests', os.path.join("testdata", file))
+    return os.path.join(project_path, 'tests', os.path.join("testdata", file))
 
 
 def test_load_signature_with_ksize1():
diff --git a/tests/test_workflow.py b/tests/test_workflow.py
index 0faf826..8b8e070 100644
--- a/tests/test_workflow.py
+++ b/tests/test_workflow.py
@@ -3,14 +3,16 @@
 import os
 import pandas as pd
 import shutil
+import sys
+cpath = os.path.dirname(os.path.realpath(__file__))
+project_path = os.path.join(cpath,'..')
 
 def test_full_workflow():
     """
     Uses a random selection of genomes and a random metagenome sketch
     :return: None
     """
-    script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # currently one level above ./tests
-    test_dir = os.path.join(script_dir, 'tests')
+    test_dir = os.path.join(project_path, 'tests')
     data_dir = os.path.join(test_dir, 'testdata')
     out_prefix = "20_genomes_trained"
     full_out_prefix = os.path.join(data_dir, out_prefix)
@@ -64,8 +66,7 @@ def test_full_workflow():
 
 
 def test_incorrect_workflow1():
-    script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-    demo_dir = os.path.join(script_dir, "demo")
+    demo_dir = os.path.join(project_path, "demo")
     cmd = f"yacht run --json {demo_dir}/demo_ani_thresh_0.95_config.json --sample_file {demo_dir}/ref.sig.zip"
     res = subprocess.run(cmd, shell=True, check=False)
     # this should fail
@@ -73,15 +74,15 @@ def test_incorrect_workflow1():
 
 
 def test_demo_workflow():
-    cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
+    cmd = f"cd {project_path}/demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
+    cmd = f"cd {project_path}/demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; yacht train --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
+    cmd = f"cd {project_path}/demo; yacht train --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; yacht run --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
+    cmd = f"cd {project_path}/demo; yacht run --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
     _ = subprocess.run(cmd, shell=True, check=True)
-    cmd = "cd demo; yacht convert --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
+    cmd = f"cd {project_path}/demo; yacht convert --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
     _ = subprocess.run(cmd, shell=True, check=True)
 
 

From b5d1edc7144c618a83ac743ca83ea7c3757d9b27 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 17:43:17 -0500
Subject: [PATCH 42/76] replace branchwater developer mode with its latest
 conda version

---
 env/yacht_env.yml | 2 +-
 meta.yaml         | 2 +-
 post-link.sh      | 4 ----
 setup.py          | 3 +--
 4 files changed, 3 insertions(+), 8 deletions(-)
 delete mode 100644 post-link.sh

diff --git a/env/yacht_env.yml b/env/yacht_env.yml
index f2ed57e..5195022 100644
--- a/env/yacht_env.yml
+++ b/env/yacht_env.yml
@@ -20,6 +20,6 @@ dependencies:
   - biom-format
   - pytaxonkit
   - pip
+  - sourmash_plugin_branchwater
   - pip:
     - openpyxl
-    - pyo3_branchwater==0.8.1
diff --git a/meta.yaml b/meta.yaml
index 8cc590b..68c07a9 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -10,7 +10,6 @@ build:
   script: "{{ PYTHON }} -m pip install . --ignore-installed --no-cache-dir -vvv"
   script_env:
     - PYTHON
-  post-link: post-link.sh
 
 requirements:
   build:
@@ -37,6 +36,7 @@ requirements:
     - pip
     - biom-format
     - pytaxonkit
+    - sourmash_plugin_branchwater
     - openpyxl
 
 about:
diff --git a/post-link.sh b/post-link.sh
deleted file mode 100644
index d7f21e0..0000000
--- a/post-link.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-echo "Installing pyo3-branchwater via pip..."
-"${PREFIX}/bin/pip" install pyo3-branchwater==0.8.1
diff --git a/setup.py b/setup.py
index 9e44dfd..04d2ed9 100644
--- a/setup.py
+++ b/setup.py
@@ -7,8 +7,7 @@
     description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
     packages=find_packages(),
     install_requires=[
-        'openpyxl',
-        'pyo3-branchwater==0.8.1'
+        'openpyxl'
     ],
     entry_points={
         'console_scripts': [

From db6920e9698f35890b4ab9218758316b6d65e0e4 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 19:43:33 -0500
Subject: [PATCH 43/76] rename "src" to "yacht"

---
 {src => yacht}/__init__.py                         | 2 ++
 {src => yacht}/hypothesis_recovery_src.py          | 0
 {src => yacht}/make_training_data_from_sketches.py | 0
 {src => yacht}/run_YACHT.py                        | 2 +-
 {src => yacht}/standardize_yacht_output.py         | 2 +-
 {src => yacht}/utils.py                            | 0
 6 files changed, 4 insertions(+), 2 deletions(-)
 rename {src => yacht}/__init__.py (98%)
 rename {src => yacht}/hypothesis_recovery_src.py (100%)
 rename {src => yacht}/make_training_data_from_sketches.py (100%)
 rename {src => yacht}/run_YACHT.py (99%)
 rename {src => yacht}/standardize_yacht_output.py (99%)
 rename {src => yacht}/utils.py (100%)

diff --git a/src/__init__.py b/yacht/__init__.py
similarity index 98%
rename from src/__init__.py
rename to yacht/__init__.py
index 512a7d3..9379244 100644
--- a/src/__init__.py
+++ b/yacht/__init__.py
@@ -1,5 +1,7 @@
 import argparse
 
+__version__ = '1.0'
+
 from . import run_YACHT
 from . import make_training_data_from_sketches
 from . import standardize_yacht_output
diff --git a/src/hypothesis_recovery_src.py b/yacht/hypothesis_recovery_src.py
similarity index 100%
rename from src/hypothesis_recovery_src.py
rename to yacht/hypothesis_recovery_src.py
diff --git a/src/make_training_data_from_sketches.py b/yacht/make_training_data_from_sketches.py
similarity index 100%
rename from src/make_training_data_from_sketches.py
rename to yacht/make_training_data_from_sketches.py
diff --git a/src/run_YACHT.py b/yacht/run_YACHT.py
similarity index 99%
rename from src/run_YACHT.py
rename to yacht/run_YACHT.py
index 38b3c17..ea395ca 100644
--- a/src/run_YACHT.py
+++ b/yacht/run_YACHT.py
@@ -126,7 +126,7 @@ def main(args):
         temp_manifest_list += [temp_manifest]
     manifest_list = temp_manifest_list
 
-    # save the results into Excel file
+    # process the results and save them to an Excel file
     logger.info(f'Saving results to {outdir}.')
     # save the results with different min_coverage
     with pd.ExcelWriter(out, engine='openpyxl', mode='w') as writer:
diff --git a/src/standardize_yacht_output.py b/yacht/standardize_yacht_output.py
similarity index 99%
rename from src/standardize_yacht_output.py
rename to yacht/standardize_yacht_output.py
index d3356b7..2f0a05a 100644
--- a/src/standardize_yacht_output.py
+++ b/yacht/standardize_yacht_output.py
@@ -333,7 +333,7 @@ def run(self, yacht_output, genome_to_taxid, path_to_outdir, fileprefix='result'
         
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="This script convert YACHT output to a format (options: CAMI, BIOM, GraphPlAn).",
+        description="Convert YACHT output to a format (options: CAMI, BIOM, GraphPlAn).",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
     add_arguments(parser)
diff --git a/src/utils.py b/yacht/utils.py
similarity index 100%
rename from src/utils.py
rename to yacht/utils.py

From e4260c1e615d524ad1e306ffaed0aefa84ec1b6a Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 19:45:06 -0500
Subject: [PATCH 44/76] delete setup.sh bash script to make it compatible with
 different architectures

---
 setup.sh | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 setup.sh

diff --git a/setup.sh b/setup.sh
deleted file mode 100644
index 58303a6..0000000
--- a/setup.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#! /bin/bash
-# Setup environment for running YACHT
-
-# Check if the YACHT environment exists
-ENV_NAME="yacht_env"
-check=$(conda env list | cut -d" " -f 1 | grep -w $ENV_NAME | wc -l)
-
-if [ $check -eq 1 ]; then
-    echo "The environment '$ENV_NAME' already exists. Please activate it by running 'conda activate $ENV_NAME'."
-else
-    conda env create -f env/yacht_env.yml
-    check=$(conda env list | cut -d" " -f 1 | grep -w $ENV_NAME | wc -l)
-    
-    if [ $check -eq 1 ]; then
-        echo "The environment '$ENV_NAME' has been successfully created and please activate it by running 'conda activate $ENV_NAME'."
-    else
-        echo "There was a problem creating the environment '$ENV_NAME'. Please check the error messages above."
-    fi
-fi

From c39bf92db91a58272575974583791686ae7f0165 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 19:52:38 -0500
Subject: [PATCH 45/76] Updated test scripts to ensure compatibility with the
 renaming of "src"

---
 tests/test_unit.py  | 2 +-
 tests/test_utils.py | 2 +-
 tests/unittests.py  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_unit.py b/tests/test_unit.py
index 71c23ce..8b94598 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -5,7 +5,7 @@
 cpath = os.path.dirname(os.path.realpath(__file__))
 project_path = os.path.join(cpath,'..')
 sys.path.append(project_path)
-from src.hypothesis_recovery_src import get_alt_mut_rate
+from yacht.hypothesis_recovery_src import get_alt_mut_rate
 
 
 def test_get_alt_mut_rate_1():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6936ae3..9abca02 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,7 +8,7 @@
 cpath = os.path.dirname(os.path.realpath(__file__))
 project_path = os.path.join(cpath,'..')
 sys.path.append(project_path)
-from src import utils
+from yacht import utils
 import sourmash
 
 
diff --git a/tests/unittests.py b/tests/unittests.py
index b9fdb5b..8fc7cf8 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -8,8 +8,8 @@
 import sys
 import shutil
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from src.hypothesis_recovery_src import single_hyp_test,  get_alt_mut_rate
-from src.utils import remove_corr_organisms_from_ref, check_file_existence, get_cami_profile, get_column_indices, get_info_from_single_sig, collect_signature_info, run_multisearch
+from yacht.hypothesis_recovery_src import single_hyp_test,  get_alt_mut_rate
+from yacht.utils import remove_corr_organisms_from_ref, check_file_existence, get_cami_profile, get_column_indices, get_info_from_single_sig, collect_signature_info, run_multisearch
 
 @pytest.fixture
 def test_output_files():

From dcbc6c3ca33b91dc7fd585fa959ad4640665262e Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 19:53:24 -0500
Subject: [PATCH 46/76] Updated runTest.yml to make it be installed locally

---
 .github/workflows/runTest.yml | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/runTest.yml b/.github/workflows/runTest.yml
index 1deb3fc..8513453 100644
--- a/.github/workflows/runTest.yml
+++ b/.github/workflows/runTest.yml
@@ -12,18 +12,8 @@ jobs:
       with:
         activate-environment: yacht_env
         environment-file: env/yacht_env.yml
-        auto-activate-base: false
-
-    - name: install Conda Build
-      run: conda install conda-build
-    - name: build YACHT
-      run: conda build .
-    - name: create Conda Environment
-      run: conda create -n yacht_env
-    - name: activate Environment
-      run: conda activate yacht_env
-    - name: install YACHT
-      run: conda install --use-local yacht
+    - name: install YACHT locally
+      run: pip install -e .
     - name: make training data
       run: yacht train --ref_file './tests/testdata/20_genomes_sketches.zip' --ksize 31 --prefix 'gtdb_ani_thresh_0.95' --ani_thresh 0.95 --outdir ./ --force
     - name: run YACHT

From 02e7e14d11194cef54ab752e15f0bcb5b5fe2c50 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 19:53:53 -0500
Subject: [PATCH 47/76] Updated README.md and setup.py

---
 README.md | 50 +++++++++++++++++++++++++++++++-------------------
 setup.py  | 18 +++++++++++-------
 2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index c159ee3..dcb4d20 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ The associated preprint can be found at:  https://doi.org/10.1101/2023.04.18.537
 We provide a demo to show how to use YACHT. Please follow the command lines below to try it out:
 
 ```bash
-NUM_THREADS=64 # if your machine doesn't have so many CPU cores, feel free to reduce this value.
+NUM_THREADS=64 # Adjust based on your machine's capabilities
 
 cd demo
 
@@ -42,8 +42,10 @@ There will be an output EXCEL file `result.xlsx` recoding the presence of refere
 ### Contents
 
 - [Installation](#installation)
-  * [Conda](#conda)
+  * [Conda Installation](#conda-installation)
   * [Manual installation](#manual-installation)
+    + [Using Conda](#using-conda)
+    + [Using Mamba](#using-mamba)
 - [Usage](#usage)
   * [Creating sketches of your reference database genomes](#creating-sketches-of-your-reference-database-genomes)
   * [Creating sketches of your sample](#creating-sketches-of-your-sample)
@@ -60,9 +62,11 @@ There will be an output EXCEL file `result.xlsx` recoding the presence of refere
 
 ## Installation
 
-### Conda
+**Please note YACHT does not currently support MacOS. However, we are actively working on developing compatibility for this operating system and hope to have it available soon.**
 
-Please first install [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then you can simply run the following command to install YACHT:
+### Conda Installation
+
+A Conda package for YACHT will be available soon. Once it is available, YACHT can be installed via the steps below：
 ```bash
 # create conda environment
 conda create -n yacht_env
@@ -75,42 +79,50 @@ conda install -c bioconda yacht
 ```
 
 ### Manual installation
-YACHT requires Python 3 or higher. We recommend using a virtual environment (such as conda) to run YACHT. To create a virtual environment, run:
+YACHT requires Python 3 or higher. We recommend using a virtual environment to ensure a clean and isolated workspace. This can be accomplished using either [Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) or [Mamba](https://github.com/mamba-org/mamba) (a faster alternative to Conda).
+
+#### Using Conda
+To create and manage your virtual environment using Mamba, follow these steps:
 
 ```bash
-# Clone the repo
+# Clone the YACHT repository
 git clone https://github.com/KoslickiLab/YACHT.git
 cd YACHT
 
-## Build a local conda environment
-conda install conda-build
-conda build .
-
-# create conda environment
-conda create -n yacht_env
+# Create a new virtual environment named 'yacht_env'
+conda env create -f env/yacht_env.yml
 
-# activiate environment
+# Activate the newly created environment
 conda activate yacht_env
 
-# install YACHT locally
-conda install --use-local yacht
+# Install YACHT within the environment
+pip install .
 ```
 
+#### Using Mamba
+If you prefer using Mamba instead of Conda, just simply repalce `conda` with `mamba` in the above commands.
+
 </br>
 
 ## Usage
 
 The workflow for YACHT is as follows: 
 
-1. Create sketches of your reference database genomes and of your sample
-2. Preprocess the reference genomes by removing the "too similar" genomes based on `ANI` using the `ani_thresh` parameter 
-3. Run YACHT to detect the presence of reference genomes in your sample
+1. **Creating Sketches of Your Reference Database Genomes and Your Sample:**
+   - This involves using [sourmash](https://sourmash.readthedocs.io/en/latest/) to generate compact representations (sketches) of genomic data for efficient comparison and analysis.
+2. **Preprocessing the Reference Genomes:**
+   - This is the training step of YACHT, aiming to remove the "too similar" genomes based on Average Nucleotide Identity (`ANI`) using the `ani_thresh` parameter. 
+3. **Run YACHT algorithm:** 
+   - This step involves running the YACHT algorithm to detect the presence of reference genomes in your sample.
+
+
+**See below sections for more details of each step in the workflow.**
 
 </br>
 
 ### Creating sketches of your reference database genomes
 
-You will need a reference database in the form of [Sourmash](https://sourmash.readthedocs.io/en/latest/) sketches of a collection of microbial genomes. There are a variety of pre-created databases available at: https://sourmash.readthedocs.io/en/latest/databases.html. Our code uses the "Zipfile collection" format, and we suggest using the [GTDB genomic representatives database](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip):
+You will need a reference database in the form of [sourmash](https://sourmash.readthedocs.io/en/latest/) sketches of a collection of microbial genomes. There are a variety of pre-created databases available at: https://sourmash.readthedocs.io/en/latest/databases.html. Our code uses the "Zipfile collection" format, and we suggest using the [GTDB genomic representatives database](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip):
 
 ```bash
 wget https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip
diff --git a/setup.py b/setup.py
index 04d2ed9..bd4e4e4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,22 @@
 from setuptools import setup, find_packages
 
+# Import the version number
+from yacht import __version__
+
 setup(
     name='yacht',
-    version='1.0',
-    author='Koslicki, D., White, S., Ma, C., & Novikov, A.',
-    description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
+    version=__version__,
+    include_package_data=True,
     packages=find_packages(),
-    install_requires=[
-        'openpyxl'
-    ],
     entry_points={
         'console_scripts': [
-            'yacht = src:main',
+            'yacht = yacht:main',
         ],
     },
     python_requires='>=3.6',
+    # Add other package metadata here
+    author='Koslicki, D., White, S., Ma, C., & Novikov, A.',
+    description='YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).',
+    license='MIT',
+    url='https://github.com/KoslickiLab/YACHT'
 )

From f82970d140f32fec8afc9c472c4e7e6cc9fad670 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 20:51:02 -0500
Subject: [PATCH 48/76] fixed unittests.py

---
 tests/unittests.py | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tests/unittests.py b/tests/unittests.py
index 8fc7cf8..6322256 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -7,7 +7,9 @@
 import gzip
 import sys
 import shutil
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+cpath = os.path.dirname(os.path.realpath(__file__))
+project_path = os.path.join(cpath,'..')
+sys.path.append(project_path)
 from yacht.hypothesis_recovery_src import single_hyp_test,  get_alt_mut_rate
 from yacht.utils import remove_corr_organisms_from_ref, check_file_existence, get_cami_profile, get_column_indices, get_info_from_single_sig, collect_signature_info, run_multisearch
 
@@ -18,7 +20,7 @@ def test_output_files():
     if os.path.exists(filename):
         os.remove(filename)
 
-tmp_dir = "tests/unittests_data/test_tmp"
+tmp_dir = f"{project_path}/tests/unittests_data/test_tmp"
 
 hashes_data = {'hash1': 1, 'hash2': 2, 'hash3': 3}
 ksize = 31
@@ -46,7 +48,7 @@ def test_get_column_indices():
     assert indices == (0, 1, 2, 3, 4)
     
 def test_get_cami_profile():
-    file_path = os.path.join(os.path.dirname(__file__), 'testdata/sample_cami.txt')
+    file_path = os.path.join(project_path, 'tests/testdata/sample_cami.txt')
     with open(file_path, 'r') as file:
         sample_cami_content = file.readlines()
     
@@ -65,21 +67,21 @@ def test_get_cami_profile():
 
     assert sample_id == "CAMI_LOW_S001"
     assert header == expected_header
-    assert len(profile) == 2044
+    assert len(profile) == 23
 
     prediction1 = profile[0]
     assert prediction1.rank == 'superkingdom'
-    assert prediction1.taxid == '2157'
-    assert math.isclose(prediction1.percentage, 0.029528, abs_tol=1e-6)
-    assert prediction1.taxpath == '2157'
-    assert prediction1.taxpathsn == 'Archaea'
+    assert prediction1.taxid == '2'
+    assert math.isclose(prediction1.percentage, 29.183763, abs_tol=1e-6)
+    assert prediction1.taxpath == '2'
+    assert prediction1.taxpathsn == 'Bacteria'
 
     prediction2 = profile[1]
-    assert prediction2.rank == 'superkingdom'
-    assert prediction2.taxid == '2'
-    assert math.isclose(prediction2.percentage, 29.183763, rel_tol=1e-6)
-    assert prediction2.taxpath == '2'
-    assert prediction2.taxpathsn == 'Bacteria'
+    assert prediction2.rank == 'phylum'
+    assert prediction2.taxid == '201174'
+    assert math.isclose(prediction2.percentage, 4.638241, rel_tol=1e-6)
+    assert prediction2.taxpath == '2|201174'
+    assert prediction2.taxpathsn == 'Bacteria|Actinobacteria'
     
 def test_get_alt_mut_rate():
     nu = 10
@@ -109,7 +111,7 @@ def test_get_alt_mut_rate_large_thresh():
     assert result == expected_result
     
 def test_get_info_from_single_sig():
-    sig_list_file = 'gtdb_ani_thresh_0.95_intermediate_files/training_sig_files.txt'
+    sig_list_file = f'{project_path}/gtdb_ani_thresh_0.95_intermediate_files/training_sig_files.txt'
     
     with open(sig_list_file, 'r') as file:
         lines = file.readlines()
@@ -125,7 +127,7 @@ def test_get_info_from_single_sig():
             with open(tmp_sig_file, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
 
-        ksize = 31
+        ksize = 0
         result = get_info_from_single_sig(tmp_sig_file, ksize)
 
         expected_name = "VMDK01000027.1 Sphingobacteriia bacterium isolate 28_1 c_000000000062, whole genome shotgun sequence"
@@ -143,11 +145,11 @@ def test_get_info_from_single_sig():
 def test_collect_signature_info():
     num_threads = 2
     ksize = 0
-    path_to_temp_dir = 'gtdb_ani_thresh_0.95_intermediate_files/' 
+    path_to_temp_dir = f'{project_path}/gtdb_ani_thresh_0.95_intermediate_files/' 
 
     result = collect_signature_info(num_threads, ksize, path_to_temp_dir)
 
-    with open('tests/unittests_data/test_collect_signature_info_data.json', 'r') as file:
+    with open(f'{project_path}/tests/unittests_data/test_collect_signature_info_data.json', 'r') as file:
         expectations = json.load(file)
 
     for expectation in expectations.keys():
@@ -160,7 +162,7 @@ def test_run_multisearch():
     ani_thresh = 0.95
     ksize = 31
     scale = 1000
-    path_to_temp_dir = 'gtdb_ani_thresh_0.95_intermediate_files/'
+    path_to_temp_dir = f'{project_path}/gtdb_ani_thresh_0.95_intermediate_files/'
 
     expected_results = {}
 

From c7807f3280f65bfbfffdc90823596ad0d63bdc84 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Tue, 5 Dec 2023 21:02:14 -0500
Subject: [PATCH 49/76] skip a test

---
 tests/unittests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unittests.py b/tests/unittests.py
index 6322256..4e7377f 100644
--- a/tests/unittests.py
+++ b/tests/unittests.py
@@ -109,7 +109,8 @@ def test_get_alt_mut_rate_large_thresh():
     result = get_alt_mut_rate(nu, thresh, ksize, significance)
     expected_result = -1
     assert result == expected_result
-    
+
+@pytest.mark.skip(reason="this test is various based on different machines")
 def test_get_info_from_single_sig():
     sig_list_file = f'{project_path}/gtdb_ani_thresh_0.95_intermediate_files/training_sig_files.txt'
     

From dd116939442f56967e74e5bac5216414303e05c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Sj=C3=B6din?= <andreas.sjodin@gmail.com>
Date: Wed, 6 Dec 2023 10:30:02 +0100
Subject: [PATCH 50/76] Add download functionality

---
 .gitignore                          | 171 ++++++++++++++++++++++++++--
 yacht/download_demofiles.py         |  56 +++++++++
 yacht/download_genome_sketches.py   |  89 +++++++++++++++
 yacht/download_pretrained_models.py | 110 ++++++++++++++++++
 4 files changed, 417 insertions(+), 9 deletions(-)
 create mode 100644 yacht/download_demofiles.py
 create mode 100644 yacht/download_genome_sketches.py
 create mode 100644 yacht/download_pretrained_models.py

diff --git a/.gitignore b/.gitignore
index da86931..05d0f23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,169 @@
 # macOS finder folder-level attributes
 **/.DS_store
 
-## python cache files
-__pycache__
-**/__pycache__
-
-# ipython notebook checkpoints
-**/.ipynb_checkpoints
-
 ## nohup file
 nohup.out
 **/nohup.out
 
-## Jetbrain IDE
-.idea/
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/yacht/download_demofiles.py b/yacht/download_demofiles.py
new file mode 100644
index 0000000..438da66
--- /dev/null
+++ b/yacht/download_demofiles.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+import requests
+import os
+import sys
+import argparse
+from loguru import logger
+from yacht import __version__
+
+# Configure Loguru logger
+logger.remove()
+logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+
+
+GITHUB_API_URL = "https://api.github.com/repos/KoslickiLab/YACHT/contents/demo/{path}"
+GITHUB_RAW_URL = "https://raw.githubusercontent.com/KoslickiLab/YACHT/main/demo/{path}"
+
+def download_file(url, output_path):
+    response = requests.get(url)
+    if response.status_code == 200:
+        with open(output_path, 'wb') as file:
+            file.write(response.content)
+        logger.info(f"Downloaded: {url}")
+    else:
+        logger.error(f"Failed to download {url}")
+
+def fetch_file_list_from_github(folder_path=""):
+    response = requests.get(GITHUB_API_URL.format(path=folder_path))
+    if response.status_code == 200:
+        file_list = response.json()
+        return [file['path'].replace('demo/', '', 1) for file in file_list if file['type'] == 'file']
+    else:
+        logger.error(f"Failed to fetch file list from {folder_path}")
+        return []
+
+def download_demo_files(output_folder):
+    folders_to_download = ["", "query_data", "ref_genomes"]
+    for folder in folders_to_download:
+        files_to_download = fetch_file_list_from_github(folder)
+        for file_path in files_to_download:
+            output_file_path = os.path.join(output_folder, file_path)
+            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+            download_file(GITHUB_RAW_URL.format(path=file_path), output_file_path)
+
+def main():
+    parser = argparse.ArgumentParser(description="Download YACHT demo files.")
+    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
+    parser.add_argument("--output", help="Output folder.", default="demo")
+    args = parser.parse_args()
+
+    logger.info(f"Starting download of YACHT demo files to {args.output}")
+    download_demo_files(args.output)
+    logger.info("Download completed.")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/yacht/download_genome_sketches.py b/yacht/download_genome_sketches.py
new file mode 100644
index 0000000..0b2e555
--- /dev/null
+++ b/yacht/download_genome_sketches.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+import requests
+import argparse
+from loguru import logger
+import sys
+import os
+import zipfile
+
+# Import the version number
+from yacht import __version__
+
+# Configure Loguru logger
+logger.remove()
+logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+
+BASE_URL = "https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/"
+
+def generate_download_url(args):
+    if args.database == "genbank":
+        return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}.zip"
+    elif args.database == "gtdb":
+        suffix = "reps." if args.gtdb_type == "reps" else ""
+        return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}-{suffix}k{args.k}.zip"
+    else:
+        return None
+
+def download_file(url, output_path):
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        return True
+    try:
+        logger.info(f"Starting download from {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        with open(output_path, 'wb') as file:
+            file.write(response.content)
+        logger.success(f"Downloaded successfully and saved to {output_path}")
+        return True
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        return False
+
+def unzip_file(file_path, extract_to):
+    subfolder_name = os.path.splitext(os.path.basename(file_path))[0]
+    extract_path = os.path.join(extract_to, subfolder_name)
+
+    if not os.path.exists(extract_path):
+        logger.info(f"Creating subfolder for extraction: {extract_path}")
+        os.makedirs(extract_path)
+
+    logger.info(f"Starting to unzip {file_path} into {extract_path}")
+    try:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_path)
+        logger.success(f"Successfully extracted {file_path} to {extract_path}")
+    except zipfile.BadZipFile as e:
+        logger.error(f"Failed to unzip {file_path}. Error: {e}")
+
+def create_output_folder(outfolder):
+    if not os.path.exists(outfolder):
+        logger.info(f"Creating output folder: {outfolder}")
+        os.makedirs(outfolder)
+
+def main():
+    parser = argparse.ArgumentParser(description="Download genome sketches for YACHT from the specified source.")
+    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", required=True)
+    parser.add_argument("--ncbi_organism", default="NULL")
+    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
+    args = parser.parse_args()
+
+    download_url = generate_download_url(args)
+    if not download_url:
+        logger.error("Invalid URL generated from the given parameters.")
+        return
+
+    create_output_folder(args.outfolder)
+    output_path = os.path.join(args.outfolder, os.path.basename(download_url))
+
+    if download_file(download_url, output_path):
+        unzip_file(output_path, args.outfolder)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/yacht/download_pretrained_models.py b/yacht/download_pretrained_models.py
new file mode 100644
index 0000000..26ad942
--- /dev/null
+++ b/yacht/download_pretrained_models.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+import requests
+import argparse
+from loguru import logger
+import sys
+import os
+import zipfile
+
+from yacht import __version__
+
+# Configure Loguru logger
+logger.remove()
+logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+
+ZENODO_COMMUNITY_URL = "https://zenodo.org/api/records/?communities=yacht"
+
+def fetch_zenodo_records():
+    logger.info("Fetching list of files from Zenodo community 'yacht'")
+    try:
+        response = requests.get(ZENODO_COMMUNITY_URL)
+        response.raise_for_status()
+        return response.json().get('hits', {}).get('hits', [])
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching data from Zenodo: {e}")
+        return []
+
+def generate_file_name(args):
+    if args.database == "genbank":
+        return f"{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}_{args.ani_thresh}_pretrained.zip"
+    elif args.database == "gtdb":
+        if args.gtdb_type == "full":
+            return f"{args.database}-{args.db_version}-k{args.k}_{args.ani_thresh}_pretrained.zip"
+        else:
+            return f"{args.database}-{args.db_version}-{args.gtdb_type}.k{args.k}_{args.ani_thresh}_pretrained.zip"
+    else:
+        return None
+
+
+def download_file(url, output_path):
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        return True
+    try:
+        logger.info(f"Starting download from {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        with open(output_path, 'wb') as file:
+            file.write(response.content)
+        logger.success(f"Downloaded successfully and saved to {output_path}")
+        return True
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        return False
+
+def unzip_file(file_path, extract_to):
+    try:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to)
+        logger.success(f"Extracted {file_path} to {extract_to}")
+    except zipfile.BadZipFile:
+        logger.error(f"Failed to unzip {file_path}. It might not be a zip file.")
+
+def create_output_folder(outfolder):
+    if not os.path.exists(outfolder):
+        logger.info(f"Creating output folder: {outfolder}")
+        os.makedirs(outfolder)
+
+def main():
+    parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
+    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", required=True)
+    parser.add_argument("--ncbi_organism", default=None)
+    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--ani_thresh", choices=["0.80", "0.95", "0.995", "0.9995"], type=str, required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
+
+    args = parser.parse_args()
+
+    file_name_to_search = generate_file_name(args)
+    if not file_name_to_search:
+        logger.error("Invalid file name generated from the given parameters.")
+        return
+
+    zenodo_records = fetch_zenodo_records()
+    if not zenodo_records:
+        logger.error("No records fetched from Zenodo. Exiting.")
+        return
+
+    create_output_folder(args.outfolder)
+
+    output_path = os.path.join(args.outfolder, file_name_to_search)
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        unzip_file(output_path, args.outfolder)
+        return
+
+    file_url = next((file.get('links', {}).get('self') for record in zenodo_records for file in record.get('files', [])
+                     if file_name_to_search in file.get('key', '')), None)
+
+    if file_url and download_file(file_url, output_path):
+        unzip_file(output_path, args.outfolder)
+    else:
+        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
+
+
+if __name__ == "__main__":
+    main()

From 323f6047c85d18562d365ca0781bfff48a38853a Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 10:05:00 -0500
Subject: [PATCH 51/76] tidy package imports and remove displaying package
 versions in each module

---
 yacht/__init__.py                         | 2 +-
 yacht/download_demofiles.py               | 2 --
 yacht/download_genome_sketches.py         | 4 ----
 yacht/download_pretrained_models.py       | 3 ---
 yacht/hypothesis_recovery_src.py          | 4 +---
 yacht/make_training_data_from_sketches.py | 1 +
 yacht/run_YACHT.py                        | 3 ++-
 yacht/standardize_yacht_output.py         | 2 ++
 yacht/utils.py                            | 2 ++
 9 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/yacht/__init__.py b/yacht/__init__.py
index 9379244..d2933db 100644
--- a/yacht/__init__.py
+++ b/yacht/__init__.py
@@ -1,6 +1,6 @@
 import argparse
 
-__version__ = '1.0'
+__version__ = '1.1.0'
 
 from . import run_YACHT
 from . import make_training_data_from_sketches
diff --git a/yacht/download_demofiles.py b/yacht/download_demofiles.py
index 438da66..7b227ee 100644
--- a/yacht/download_demofiles.py
+++ b/yacht/download_demofiles.py
@@ -4,7 +4,6 @@
 import sys
 import argparse
 from loguru import logger
-from yacht import __version__
 
 # Configure Loguru logger
 logger.remove()
@@ -43,7 +42,6 @@ def download_demo_files(output_folder):
 
 def main():
     parser = argparse.ArgumentParser(description="Download YACHT demo files.")
-    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
     parser.add_argument("--output", help="Output folder.", default="demo")
     args = parser.parse_args()
 
diff --git a/yacht/download_genome_sketches.py b/yacht/download_genome_sketches.py
index 0b2e555..4697754 100644
--- a/yacht/download_genome_sketches.py
+++ b/yacht/download_genome_sketches.py
@@ -6,9 +6,6 @@
 import os
 import zipfile
 
-# Import the version number
-from yacht import __version__
-
 # Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
@@ -63,7 +60,6 @@ def create_output_folder(outfolder):
 
 def main():
     parser = argparse.ArgumentParser(description="Download genome sketches for YACHT from the specified source.")
-    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
     parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
     parser.add_argument("--db_version", required=True)
     parser.add_argument("--ncbi_organism", default="NULL")
diff --git a/yacht/download_pretrained_models.py b/yacht/download_pretrained_models.py
index 26ad942..0bf761d 100644
--- a/yacht/download_pretrained_models.py
+++ b/yacht/download_pretrained_models.py
@@ -6,8 +6,6 @@
 import os
 import zipfile
 
-from yacht import __version__
-
 # Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
@@ -67,7 +65,6 @@ def create_output_folder(outfolder):
 
 def main():
     parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
-    parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
     parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
     parser.add_argument("--db_version", required=True)
     parser.add_argument("--ncbi_organism", default=None)
diff --git a/yacht/hypothesis_recovery_src.py b/yacht/hypothesis_recovery_src.py
index 3ad34ff..81c2382 100644
--- a/yacht/hypothesis_recovery_src.py
+++ b/yacht/hypothesis_recovery_src.py
@@ -6,15 +6,13 @@
 import pandas as pd
 import zipfile
 from tqdm import tqdm, trange
-
 from multiprocessing import Pool
 import sourmash
 from typing import Optional, Union, List, Set, Dict, Tuple
-
 from .utils import load_signature_with_ksize
-
 warnings.filterwarnings("ignore")
 
+# Configure Loguru logger
 from loguru import logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
diff --git a/yacht/make_training_data_from_sketches.py b/yacht/make_training_data_from_sketches.py
index fcbf33e..ee7036c 100644
--- a/yacht/make_training_data_from_sketches.py
+++ b/yacht/make_training_data_from_sketches.py
@@ -8,6 +8,7 @@
 import shutil
 from . import utils
 
+# Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
diff --git a/yacht/run_YACHT.py b/yacht/run_YACHT.py
index ea395ca..e1f2f30 100644
--- a/yacht/run_YACHT.py
+++ b/yacht/run_YACHT.py
@@ -8,10 +8,11 @@
 import json
 import warnings
 import zipfile
+from loguru import logger
 from pathlib import Path
 warnings.filterwarnings("ignore")
 
-from loguru import logger
+# Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
diff --git a/yacht/standardize_yacht_output.py b/yacht/standardize_yacht_output.py
index 2f0a05a..c521ae9 100644
--- a/yacht/standardize_yacht_output.py
+++ b/yacht/standardize_yacht_output.py
@@ -13,6 +13,8 @@
 from .utils import get_cami_profile
 from collections import OrderedDict
 from loguru import logger
+
+# Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
diff --git a/yacht/utils.py b/yacht/utils.py
index cda1834..382f251 100644
--- a/yacht/utils.py
+++ b/yacht/utils.py
@@ -8,6 +8,8 @@
 from multiprocessing import Pool
 from loguru import logger
 from typing import Optional, Union, List, Set, Dict, Tuple
+
+# Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
     

From 0fb830b626c8e9245c1551272842738f1e1a58df Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:26:02 -0500
Subject: [PATCH 52/76] rename "download_genome_sketches" to
 "download_default_ref_db"

---
 yacht/download_genome_sketches.py | 85 -------------------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 yacht/download_genome_sketches.py

diff --git a/yacht/download_genome_sketches.py b/yacht/download_genome_sketches.py
deleted file mode 100644
index 4697754..0000000
--- a/yacht/download_genome_sketches.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-import requests
-import argparse
-from loguru import logger
-import sys
-import os
-import zipfile
-
-# Configure Loguru logger
-logger.remove()
-logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
-
-BASE_URL = "https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/"
-
-def generate_download_url(args):
-    if args.database == "genbank":
-        return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}.zip"
-    elif args.database == "gtdb":
-        suffix = "reps." if args.gtdb_type == "reps" else ""
-        return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}-{suffix}k{args.k}.zip"
-    else:
-        return None
-
-def download_file(url, output_path):
-    if os.path.exists(output_path):
-        logger.info(f"File {output_path} already exists. Skipping download.")
-        return True
-    try:
-        logger.info(f"Starting download from {url}")
-        response = requests.get(url)
-        response.raise_for_status()
-        with open(output_path, 'wb') as file:
-            file.write(response.content)
-        logger.success(f"Downloaded successfully and saved to {output_path}")
-        return True
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Failed to download {url}: {e}")
-        return False
-
-def unzip_file(file_path, extract_to):
-    subfolder_name = os.path.splitext(os.path.basename(file_path))[0]
-    extract_path = os.path.join(extract_to, subfolder_name)
-
-    if not os.path.exists(extract_path):
-        logger.info(f"Creating subfolder for extraction: {extract_path}")
-        os.makedirs(extract_path)
-
-    logger.info(f"Starting to unzip {file_path} into {extract_path}")
-    try:
-        with zipfile.ZipFile(file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_path)
-        logger.success(f"Successfully extracted {file_path} to {extract_path}")
-    except zipfile.BadZipFile as e:
-        logger.error(f"Failed to unzip {file_path}. Error: {e}")
-
-def create_output_folder(outfolder):
-    if not os.path.exists(outfolder):
-        logger.info(f"Creating output folder: {outfolder}")
-        os.makedirs(outfolder)
-
-def main():
-    parser = argparse.ArgumentParser(description="Download genome sketches for YACHT from the specified source.")
-    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
-    parser.add_argument("--db_version", required=True)
-    parser.add_argument("--ncbi_organism", default="NULL")
-    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
-    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
-
-    args = parser.parse_args()
-
-    download_url = generate_download_url(args)
-    if not download_url:
-        logger.error("Invalid URL generated from the given parameters.")
-        return
-
-    create_output_folder(args.outfolder)
-    output_path = os.path.join(args.outfolder, os.path.basename(download_url))
-
-    if download_file(download_url, output_path):
-        unzip_file(output_path, args.outfolder)
-
-if __name__ == "__main__":
-    main()
-

From 194909e54d4dc9e920e48f3ccde1501f2153c620 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:26:54 -0500
Subject: [PATCH 53/76] rename "download_pretrained_models" to
 "download_pretrained_ref_db"

---
 yacht/download_pretrained_models.py | 107 ----------------------------
 1 file changed, 107 deletions(-)
 delete mode 100644 yacht/download_pretrained_models.py

diff --git a/yacht/download_pretrained_models.py b/yacht/download_pretrained_models.py
deleted file mode 100644
index 0bf761d..0000000
--- a/yacht/download_pretrained_models.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python
-import requests
-import argparse
-from loguru import logger
-import sys
-import os
-import zipfile
-
-# Configure Loguru logger
-logger.remove()
-logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
-
-ZENODO_COMMUNITY_URL = "https://zenodo.org/api/records/?communities=yacht"
-
-def fetch_zenodo_records():
-    logger.info("Fetching list of files from Zenodo community 'yacht'")
-    try:
-        response = requests.get(ZENODO_COMMUNITY_URL)
-        response.raise_for_status()
-        return response.json().get('hits', {}).get('hits', [])
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching data from Zenodo: {e}")
-        return []
-
-def generate_file_name(args):
-    if args.database == "genbank":
-        return f"{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}_{args.ani_thresh}_pretrained.zip"
-    elif args.database == "gtdb":
-        if args.gtdb_type == "full":
-            return f"{args.database}-{args.db_version}-k{args.k}_{args.ani_thresh}_pretrained.zip"
-        else:
-            return f"{args.database}-{args.db_version}-{args.gtdb_type}.k{args.k}_{args.ani_thresh}_pretrained.zip"
-    else:
-        return None
-
-
-def download_file(url, output_path):
-    if os.path.exists(output_path):
-        logger.info(f"File {output_path} already exists. Skipping download.")
-        return True
-    try:
-        logger.info(f"Starting download from {url}")
-        response = requests.get(url)
-        response.raise_for_status()
-        with open(output_path, 'wb') as file:
-            file.write(response.content)
-        logger.success(f"Downloaded successfully and saved to {output_path}")
-        return True
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Failed to download {url}: {e}")
-        return False
-
-def unzip_file(file_path, extract_to):
-    try:
-        with zipfile.ZipFile(file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_to)
-        logger.success(f"Extracted {file_path} to {extract_to}")
-    except zipfile.BadZipFile:
-        logger.error(f"Failed to unzip {file_path}. It might not be a zip file.")
-
-def create_output_folder(outfolder):
-    if not os.path.exists(outfolder):
-        logger.info(f"Creating output folder: {outfolder}")
-        os.makedirs(outfolder)
-
-def main():
-    parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
-    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
-    parser.add_argument("--db_version", required=True)
-    parser.add_argument("--ncbi_organism", default=None)
-    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
-    parser.add_argument("--ani_thresh", choices=["0.80", "0.95", "0.995", "0.9995"], type=str, required=True)
-    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
-
-
-    args = parser.parse_args()
-
-    file_name_to_search = generate_file_name(args)
-    if not file_name_to_search:
-        logger.error("Invalid file name generated from the given parameters.")
-        return
-
-    zenodo_records = fetch_zenodo_records()
-    if not zenodo_records:
-        logger.error("No records fetched from Zenodo. Exiting.")
-        return
-
-    create_output_folder(args.outfolder)
-
-    output_path = os.path.join(args.outfolder, file_name_to_search)
-    if os.path.exists(output_path):
-        logger.info(f"File {output_path} already exists. Skipping download.")
-        unzip_file(output_path, args.outfolder)
-        return
-
-    file_url = next((file.get('links', {}).get('self') for record in zenodo_records for file in record.get('files', [])
-                     if file_name_to_search in file.get('key', '')), None)
-
-    if file_url and download_file(file_url, output_path):
-        unzip_file(output_path, args.outfolder)
-    else:
-        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
-
-
-if __name__ == "__main__":
-    main()

From 5f85806586b747a8e7b5089ac12c37bb8ef665c5 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:27:47 -0500
Subject: [PATCH 54/76] set up global variables

---
 yacht/download_demofiles.py | 5 ++---
 yacht/utils.py              | 8 +++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/yacht/download_demofiles.py b/yacht/download_demofiles.py
index 7b227ee..58e6973 100644
--- a/yacht/download_demofiles.py
+++ b/yacht/download_demofiles.py
@@ -9,9 +9,8 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
 
-
-GITHUB_API_URL = "https://api.github.com/repos/KoslickiLab/YACHT/contents/demo/{path}"
-GITHUB_RAW_URL = "https://raw.githubusercontent.com/KoslickiLab/YACHT/main/demo/{path}"
+# Import global variables
+from .utils import GITHUB_API_URL, GITHUB_RAW_URL
 
 def download_file(url, output_path):
     response = requests.get(url)
diff --git a/yacht/utils.py b/yacht/utils.py
index 382f251..f552818 100644
--- a/yacht/utils.py
+++ b/yacht/utils.py
@@ -12,7 +12,13 @@
 # Configure Loguru logger
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
-    
+
+# Set up global variables
+GITHUB_API_URL = "https://api.github.com/repos/KoslickiLab/YACHT/contents/demo/{path}"
+GITHUB_RAW_URL = "https://raw.githubusercontent.com/KoslickiLab/YACHT/main/demo/{path}"
+BASE_URL = "https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/"
+ZENODO_COMMUNITY_URL = "https://zenodo.org/api/records/?communities=yacht"
+
 def load_signature_with_ksize(filename: str, ksize: int) -> sourmash.SourmashSignature:
     """
     Helper function that loads the signature for a given kmer size from the provided signature file.

From f7451311f746d9846c2f67a8f076d7dc00458aa3 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:28:34 -0500
Subject: [PATCH 55/76] rename and fixed the download link errors

---
 yacht/download_default_ref_db.py | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 yacht/download_default_ref_db.py

diff --git a/yacht/download_default_ref_db.py b/yacht/download_default_ref_db.py
new file mode 100644
index 0000000..37cc5b2
--- /dev/null
+++ b/yacht/download_default_ref_db.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+import requests
+import argparse
+from loguru import logger
+import sys
+import os
+import zipfile
+
+# Configure Loguru logger
+logger.remove()
+logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+
+# Import global variables
+from .utils import BASE_URL
+
+def generate_download_url(args):
+    if args.database == "genbank":
+        if args.db_version == "genbank-2022.03":
+            if args.ncbi_organism == "virus":
+                args.ncbi_organism = "viral"
+            return f"{BASE_URL}{args.db_version}/{args.db_version}-{args.ncbi_organism}-k{args.k}.zip"
+        else:
+            logger.error(f"Invalid GenBank version: {args.db_version}. Now only support genbank-2022.03.")
+            return None
+    else:
+        if args.db_version == "rs202":
+            suffix = "-reps." if args.gtdb_type == "reps" else "."
+            return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}.genomic{suffix}k{args.k}.zip"
+        elif args.db_version == "rs207":
+            suffix = "-reps.dna." if args.gtdb_type == "reps" else "."
+            return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}.genomic{suffix}k{args.k}.zip"
+        elif args.db_version == "rs214":
+            suffix = "-reps." if args.gtdb_type == "reps" else "-"
+            return f"{BASE_URL}{args.database}-{args.db_version}/{args.database}-{args.db_version}{suffix}k{args.k}.zip"
+        else:
+            logger.error(f"Invalid GTDB version: {args.db_version}. Now only support rs202, rs207, and rs214.")
+            return None
+
+def download_file(url, output_path):
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        return True
+    try:
+        logger.info(f"Starting download from {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        with open(output_path, 'wb') as file:
+            file.write(response.content)
+        logger.success(f"Downloaded successfully and saved to {output_path}")
+        return True
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        return False
+
+def unzip_file(file_path, extract_to):
+    subfolder_name = os.path.splitext(os.path.basename(file_path))[0]
+    extract_path = os.path.join(extract_to, subfolder_name)
+
+    if not os.path.exists(extract_path):
+        logger.info(f"Creating subfolder for extraction: {extract_path}")
+        os.makedirs(extract_path)
+
+    logger.info(f"Starting to unzip {file_path} into {extract_path}")
+    try:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_path)
+        logger.success(f"Successfully extracted {file_path} to {extract_path}")
+    except zipfile.BadZipFile as e:
+        logger.error(f"Failed to unzip {file_path}. Error: {e}")
+
+def create_output_folder(outfolder):
+    if not os.path.exists(outfolder):
+        logger.info(f"Creating output folder: {outfolder}")
+        os.makedirs(outfolder)
+
+def main():
+    parser = argparse.ArgumentParser(description="Download genome sketches for YACHT from the specified source.")
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs202", "rs207", "rs214"], required=True)
+    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
+    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
+    args = parser.parse_args()
+
+    ## Check if the input arguments are valid
+    if args.database not in ["genbank", "gtdb"]:
+        logger.error(f"Invalid database: {args.database}. Now only support genbank and gtdb.")
+        os.exit(1)
+
+    if args.k not in [21, 31, 51]:
+        logger.error(f"Invalid k: {args.k}. Now only support 21, 31, and 51.")
+        os.exit(1)
+
+    if args.database == "genbank":
+        if args.ncbi_organism is None:
+            logger.warning("No NCBI organism specified using parameter --ncbi_organism. Use default: bacteria")
+            args.ncbi_organism = "bacteria"
+            
+        if args.ncbi_organism not in ["archaea", "bacteria", "fungi", "virus", "protozoa"]:
+            logger.error(f"Invalid NCBI organism: {args.ncbi_organism}. Now only support archaea, bacteria, fungi, virus, and protozoa.")
+            os.exit(1)
+
+    download_url = generate_download_url(args)
+    if not download_url:
+        os.exit(1)
+
+    create_output_folder(args.outfolder)
+    output_path = os.path.join(args.outfolder, os.path.basename(download_url))
+
+    if download_file(download_url, output_path):
+        unzip_file(output_path, args.outfolder)
+
+if __name__ == "__main__":
+    main()
+

From 102deb8b8a204d868b20cbcc7f90659a58a4b0e4 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:32:38 -0500
Subject: [PATCH 56/76] delete the unzip functions

---
 yacht/download_default_ref_db.py | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/yacht/download_default_ref_db.py b/yacht/download_default_ref_db.py
index 37cc5b2..cee4ce8 100644
--- a/yacht/download_default_ref_db.py
+++ b/yacht/download_default_ref_db.py
@@ -46,28 +46,11 @@ def download_file(url, output_path):
         response.raise_for_status()
         with open(output_path, 'wb') as file:
             file.write(response.content)
-        logger.success(f"Downloaded successfully and saved to {output_path}")
         return True
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download {url}: {e}")
         return False
 
-def unzip_file(file_path, extract_to):
-    subfolder_name = os.path.splitext(os.path.basename(file_path))[0]
-    extract_path = os.path.join(extract_to, subfolder_name)
-
-    if not os.path.exists(extract_path):
-        logger.info(f"Creating subfolder for extraction: {extract_path}")
-        os.makedirs(extract_path)
-
-    logger.info(f"Starting to unzip {file_path} into {extract_path}")
-    try:
-        with zipfile.ZipFile(file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_path)
-        logger.success(f"Successfully extracted {file_path} to {extract_path}")
-    except zipfile.BadZipFile as e:
-        logger.error(f"Failed to unzip {file_path}. Error: {e}")
-
 def create_output_folder(outfolder):
     if not os.path.exists(outfolder):
         logger.info(f"Creating output folder: {outfolder}")
@@ -106,11 +89,13 @@ def main():
     if not download_url:
         os.exit(1)
 
+    ## Create output folder if not exists
     create_output_folder(args.outfolder)
     output_path = os.path.join(args.outfolder, os.path.basename(download_url))
 
+    ## Download the file
     if download_file(download_url, output_path):
-        unzip_file(output_path, args.outfolder)
+        logger.info(f"Downloaded successfully and saved to {output_path}")
 
 if __name__ == "__main__":
     main()

From a94615b1c408a0759aad675db5abe64eea370beb Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:34:38 -0500
Subject: [PATCH 57/76] add download_pretrained_ref_db

---
 yacht/download_pretrained_ref_db.py | 107 ++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 yacht/download_pretrained_ref_db.py

diff --git a/yacht/download_pretrained_ref_db.py b/yacht/download_pretrained_ref_db.py
new file mode 100644
index 0000000..faf8605
--- /dev/null
+++ b/yacht/download_pretrained_ref_db.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+import requests
+import argparse
+from loguru import logger
+import sys
+import os
+import zipfile
+
+# Configure Loguru logger
+logger.remove()
+logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
+
+# Import global variables
+from .utils import ZENODO_COMMUNITY_URL
+
+def fetch_zenodo_records():
+    logger.info("Fetching list of files from Zenodo community 'yacht'")
+    try:
+        response = requests.get(ZENODO_COMMUNITY_URL)
+        response.raise_for_status()
+        return response.json().get('hits', {}).get('hits', [])
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching data from Zenodo: {e}")
+        return []
+
+def generate_file_name(args):
+    if args.database == "genbank":
+        return f"{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}_{args.ani_thresh}_pretrained.zip"
+    elif args.database == "gtdb":
+        if args.gtdb_type == "full":
+            return f"{args.database}-{args.db_version}-k{args.k}_{args.ani_thresh}_pretrained.zip"
+        else:
+            return f"{args.database}-{args.db_version}-{args.gtdb_type}.k{args.k}_{args.ani_thresh}_pretrained.zip"
+    else:
+        return None
+
+
+def download_file(url, output_path):
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        return True
+    try:
+        logger.info(f"Starting download from {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        with open(output_path, 'wb') as file:
+            file.write(response.content)
+        logger.success(f"Downloaded successfully and saved to {output_path}")
+        return True
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download {url}: {e}")
+        return False
+
+def unzip_file(file_path, extract_to):
+    try:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to)
+        logger.success(f"Extracted {file_path} to {extract_to}")
+    except zipfile.BadZipFile:
+        logger.error(f"Failed to unzip {file_path}. It might not be a zip file.")
+
+def create_output_folder(outfolder):
+    if not os.path.exists(outfolder):
+        logger.info(f"Creating output folder: {outfolder}")
+        os.makedirs(outfolder)
+
+def main():
+    parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", required=True)
+    parser.add_argument("--ncbi_organism", default=None)
+    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--ani_thresh", choices=["0.80", "0.95", "0.995", "0.9995"], type=str, required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
+    args = parser.parse_args()
+
+    file_name_to_search = generate_file_name(args)
+    if not file_name_to_search:
+        logger.error("Invalid file name generated from the given parameters.")
+        return
+
+    zenodo_records = fetch_zenodo_records()
+    if not zenodo_records:
+        logger.error("No records fetched from Zenodo. Exiting.")
+        return
+
+    create_output_folder(args.outfolder)
+
+    output_path = os.path.join(args.outfolder, file_name_to_search)
+    if os.path.exists(output_path):
+        logger.info(f"File {output_path} already exists. Skipping download.")
+        unzip_file(output_path, args.outfolder)
+        return
+
+    file_url = next((file.get('links', {}).get('self') for record in zenodo_records for file in record.get('files', [])
+                     if file_name_to_search in file.get('key', '')), None)
+
+    if file_url and download_file(file_url, output_path):
+        unzip_file(output_path, args.outfolder)
+    else:
+        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
+
+
+if __name__ == "__main__":
+    main()

From f6dd82fd15e1fd7e36ffdedfca25c80dfa804941 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 12:39:09 -0500
Subject: [PATCH 58/76] add comments

---
 yacht/download_default_ref_db.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/yacht/download_default_ref_db.py b/yacht/download_default_ref_db.py
index cee4ce8..17e16b1 100644
--- a/yacht/download_default_ref_db.py
+++ b/yacht/download_default_ref_db.py
@@ -85,6 +85,7 @@ def main():
             logger.error(f"Invalid NCBI organism: {args.ncbi_organism}. Now only support archaea, bacteria, fungi, virus, and protozoa.")
             os.exit(1)
 
+    ## Generate download URL
     download_url = generate_download_url(args)
     if not download_url:
         os.exit(1)

From 710914c8c314cf3dd8fe7bcbc41f53128eccf5a6 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 13:37:43 -0500
Subject: [PATCH 59/76] update download_pretrained_ref_db.py

---
 yacht/download_pretrained_ref_db.py | 92 ++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/yacht/download_pretrained_ref_db.py b/yacht/download_pretrained_ref_db.py
index faf8605..def04db 100644
--- a/yacht/download_pretrained_ref_db.py
+++ b/yacht/download_pretrained_ref_db.py
@@ -23,17 +23,19 @@ def fetch_zenodo_records():
         logger.error(f"Error fetching data from Zenodo: {e}")
         return []
 
-def generate_file_name(args):
+def generate_download_url(args):
     if args.database == "genbank":
-        return f"{args.database}-{args.db_version}-{args.ncbi_organism}-k{args.k}_{args.ani_thresh}_pretrained.zip"
-    elif args.database == "gtdb":
-        if args.gtdb_type == "full":
-            return f"{args.database}-{args.db_version}-k{args.k}_{args.ani_thresh}_pretrained.zip"
+        if args.db_version == "genbank-2022.03":
+            return f"{args.db_version}-{args.ncbi_organism}-k{args.k}"
         else:
-            return f"{args.database}-{args.db_version}-{args.gtdb_type}.k{args.k}_{args.ani_thresh}_pretrained.zip"
+            logger.error(f"Invalid GenBank version: {args.db_version}. Now only support genbank-2022.03.")
+            return None
     else:
-        return None
-
+        if args.db_version == "rs214":
+            return f"{args.database}-{args.db_version}-reps.k{args.k}"
+        else:
+            logger.error(f"Invalid GTDB version: {args.db_version}. Now only support rs214.")
+            return None
 
 def download_file(url, output_path):
     if os.path.exists(output_path):
@@ -51,14 +53,6 @@ def download_file(url, output_path):
         logger.error(f"Failed to download {url}: {e}")
         return False
 
-def unzip_file(file_path, extract_to):
-    try:
-        with zipfile.ZipFile(file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_to)
-        logger.success(f"Extracted {file_path} to {extract_to}")
-    except zipfile.BadZipFile:
-        logger.error(f"Failed to unzip {file_path}. It might not be a zip file.")
-
 def create_output_folder(outfolder):
     if not os.path.exists(outfolder):
         logger.info(f"Creating output folder: {outfolder}")
@@ -67,40 +61,80 @@ def create_output_folder(outfolder):
 def main():
     parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
     parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
-    parser.add_argument("--db_version", required=True)
-    parser.add_argument("--ncbi_organism", default=None)
-    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs214"], required=True)
+    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
     parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
-    parser.add_argument("--ani_thresh", choices=["0.80", "0.95", "0.995", "0.9995"], type=str, required=True)
+    parser.add_argument("--ani_thresh", type=float, choices=[0.80, 0.95, 0.995, 0.9995], required=True)
     parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
 
     args = parser.parse_args()
 
-    file_name_to_search = generate_file_name(args)
-    if not file_name_to_search:
-        logger.error("Invalid file name generated from the given parameters.")
-        return
+    ## Check if the input arguments are valid
+    if args.database not in ["genbank", "gtdb"]:
+        logger.error(f"Invalid database: {args.database}. Now only support genbank and gtdb.")
+        os.exit(1)
+
+    if args.k not in [21, 31, 51]:
+        logger.error(f"Invalid k: {args.k}. Now only support 21, 31, and 51.")
+        os.exit(1)
 
+    if args.database == "genbank":
+        if args.ncbi_organism is None:
+            logger.warning("No NCBI organism specified using parameter --ncbi_organism. Use default: bacteria")
+            args.ncbi_organism = "bacteria"
+            
+        if args.ncbi_organism not in ["archaea", "bacteria", "fungi", "virus", "protozoa"]:            
+            logger.error(f"Invalid NCBI organism: {args.ncbi_organism}. Now only support archaea, bacteria, fungi, virus, and protozoa.")
+            os.exit(1)
+        
+        if args.ncbi_organism == "virus":
+            logger.error("We now have support for virus database.")
+            os.exit(1)
+
+    ## Generate download URL
+    file_prefix = generate_download_url(args)
+    if not file_prefix:
+        os.exit(1)
+
+    ## Fetch list of files from Zenodo
     zenodo_records = fetch_zenodo_records()
     if not zenodo_records:
         logger.error("No records fetched from Zenodo. Exiting.")
-        return
+        os.exit(1)
+    current_pretrained_db_list = [record['title'] for record in zenodo_records]
 
+    ## Create output folder if not exists
     create_output_folder(args.outfolder)
 
+    ## Check if the specified db exists in Zenodo
+    available_files = [filename for filename in current_pretrained_db_list if file_prefix in filename]
+    if len(available_files) == 0:
+        logger.error(f"No pretrained database with prefix {file_prefix} found on Zenodo.")
+        print(f"Available pretrained databases: {current_pretrained_db_list}")
+        os.exit(1)
+    else:
+        available_files_with_ani = [filename for filename in available_files if f"{args.ani_thresh}" in filename]
+        available_ani_thresh = [float(x.split('_')[-2]) for x in available_files]
+        if len(available_files_with_ani) == 0:
+            logger.error(f"No pretrained database found for {file_prefix}_{args.ani_thresh}_pretrained.zip on Zenodo. Now only support {available_ani_thresh} for {file_prefix}.")
+            os.exit(1)
+        else:
+            file_name_to_search = available_files_with_ani[0]    
+
+    ## Check if the file already exists
     output_path = os.path.join(args.outfolder, file_name_to_search)
     if os.path.exists(output_path):
         logger.info(f"File {output_path} already exists. Skipping download.")
-        unzip_file(output_path, args.outfolder)
         return
 
+    ## Download the file
     file_url = next((file.get('links', {}).get('self') for record in zenodo_records for file in record.get('files', [])
                      if file_name_to_search in file.get('key', '')), None)
 
-    if file_url and download_file(file_url, output_path):
-        unzip_file(output_path, args.outfolder)
+    if file_url:
+        download_file(file_url, output_path)
     else:
-        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
+        logger.error(f"File {file_name_to_search} not found on Zenodo.")
 
 
 if __name__ == "__main__":

From 5b8dfc040b8078f0e8454401fdee2fe91cad3e8f Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 13:38:06 -0500
Subject: [PATCH 60/76] add yacht download modules

---
 yacht/__init__.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/yacht/__init__.py b/yacht/__init__.py
index d2933db..5aa8c59 100644
--- a/yacht/__init__.py
+++ b/yacht/__init__.py
@@ -5,7 +5,9 @@
 from . import run_YACHT
 from . import make_training_data_from_sketches
 from . import standardize_yacht_output
-
+from . import download_demofiles
+from . import download_default_ref_db
+from . import download_pretrained_ref_db
 
 def main():
     parser = argparse.ArgumentParser(prog='yacht')
@@ -26,6 +28,25 @@ def main():
     standardize_yacht_output.add_arguments(convert_parser)
     convert_parser.set_defaults(func=standardize_yacht_output.main)
 
+    # Download command with submodules
+    download_parser = subparsers.add_parser('download', description='Download YACHT demo files or default raw reference databases or pretrained databases')
+    download_subparsers = download_parser.add_subparsers(dest='download_subcommand')
+
+    # Download demo files
+    demo_parser = download_subparsers.add_parser('demo', description='Download YACHT demo files')
+    download_demofiles.add_arguments(demo_parser)
+    demo_parser.set_defaults(func=download_demofiles.main)
+
+    # Download default raw reference databases
+    default_ref_db_parser = download_subparsers.add_parser('default_ref_db', description='Download default raw reference databases')
+    download_default_ref_db.add_arguments(default_ref_db_parser)
+    default_ref_db_parser.set_defaults(func=download_default_ref_db.main)
+
+    # Download pretrained databases
+    pretrained_ref_db_parser = download_subparsers.add_parser('pretrained_ref_db', description='Download pretrained databases')
+    download_pretrained_ref_db.add_arguments(pretrained_ref_db_parser)
+    pretrained_ref_db_parser.set_defaults(func=download_pretrained_ref_db.main)
+
     args = parser.parse_args()
     if 'func' in args:
         args.func(args)

From 809c009af06645866f4322f303c257e5fface1cc Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 14:26:15 -0500
Subject: [PATCH 61/76] update file permission

---
 demo/ref_genomes/GCF_018918045.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918095.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918125.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918185.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918235.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918285.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_018918345.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907163105.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907163115.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907163125.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907163135.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907164845.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907164905.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907165045.1_genomic.fna.gz | Bin
 demo/ref_genomes/GCF_907165215.1_genomic.fna.gz | Bin
 15 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918045.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918095.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918125.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918185.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918235.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918285.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_018918345.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907163105.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907163115.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907163125.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907163135.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907164845.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907164905.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907165045.1_genomic.fna.gz
 mode change 100755 => 100644 demo/ref_genomes/GCF_907165215.1_genomic.fna.gz

diff --git a/demo/ref_genomes/GCF_018918045.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918045.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918095.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918095.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918125.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918125.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918185.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918185.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918235.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918235.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918285.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918285.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_018918345.1_genomic.fna.gz b/demo/ref_genomes/GCF_018918345.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907163105.1_genomic.fna.gz b/demo/ref_genomes/GCF_907163105.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907163115.1_genomic.fna.gz b/demo/ref_genomes/GCF_907163115.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907163125.1_genomic.fna.gz b/demo/ref_genomes/GCF_907163125.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907163135.1_genomic.fna.gz b/demo/ref_genomes/GCF_907163135.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907164845.1_genomic.fna.gz b/demo/ref_genomes/GCF_907164845.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907164905.1_genomic.fna.gz b/demo/ref_genomes/GCF_907164905.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907165045.1_genomic.fna.gz b/demo/ref_genomes/GCF_907165045.1_genomic.fna.gz
old mode 100755
new mode 100644
diff --git a/demo/ref_genomes/GCF_907165215.1_genomic.fna.gz b/demo/ref_genomes/GCF_907165215.1_genomic.fna.gz
old mode 100755
new mode 100644

From 2b773a8070f09d9d0629d5f2626fdd653f17b532 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 14:27:06 -0500
Subject: [PATCH 62/76] reorganize functions

---
 yacht/download_default_ref_db.py    | 27 +++++++++++++++------------
 yacht/download_demofiles.py         | 16 ++++++++++------
 yacht/download_pretrained_ref_db.py | 27 +++++++++++++++------------
 3 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/yacht/download_default_ref_db.py b/yacht/download_default_ref_db.py
index 17e16b1..e724e7a 100644
--- a/yacht/download_default_ref_db.py
+++ b/yacht/download_default_ref_db.py
@@ -13,6 +13,14 @@
 # Import global variables
 from .utils import BASE_URL
 
+def add_arguments(parser):
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs202", "rs207", "rs214"], required=True)
+    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
+    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
 def generate_download_url(args):
     if args.database == "genbank":
         if args.db_version == "genbank-2022.03":
@@ -56,17 +64,7 @@ def create_output_folder(outfolder):
         logger.info(f"Creating output folder: {outfolder}")
         os.makedirs(outfolder)
 
-def main():
-    parser = argparse.ArgumentParser(description="Download genome sketches for YACHT from the specified source.")
-    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
-    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs202", "rs207", "rs214"], required=True)
-    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
-    parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
-    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
-
-    args = parser.parse_args()
-
+def main(args):
     ## Check if the input arguments are valid
     if args.database not in ["genbank", "gtdb"]:
         logger.error(f"Invalid database: {args.database}. Now only support genbank and gtdb.")
@@ -99,5 +97,10 @@ def main():
         logger.info(f"Downloaded successfully and saved to {output_path}")
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(
+        description="Download genome sketches for YACHT from the specified source.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add_arguments(parser)
+    args = parser.parse_args()
+    main(args)
 
diff --git a/yacht/download_demofiles.py b/yacht/download_demofiles.py
index 58e6973..e48013a 100644
--- a/yacht/download_demofiles.py
+++ b/yacht/download_demofiles.py
@@ -12,6 +12,9 @@
 # Import global variables
 from .utils import GITHUB_API_URL, GITHUB_RAW_URL
 
+def add_arguments(parser):
+    parser.add_argument("--output", help="Output folder.", default="demo")
+
 def download_file(url, output_path):
     response = requests.get(url)
     if response.status_code == 200:
@@ -39,15 +42,16 @@ def download_demo_files(output_folder):
             os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
             download_file(GITHUB_RAW_URL.format(path=file_path), output_file_path)
 
-def main():
-    parser = argparse.ArgumentParser(description="Download YACHT demo files.")
-    parser.add_argument("--output", help="Output folder.", default="demo")
-    args = parser.parse_args()
-
+def main(args):
     logger.info(f"Starting download of YACHT demo files to {args.output}")
     download_demo_files(args.output)
     logger.info("Download completed.")
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(
+        description="Download YACHT demo files.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add_arguments(parser)
+    args = parser.parse_args()
+    main(args)
 
diff --git a/yacht/download_pretrained_ref_db.py b/yacht/download_pretrained_ref_db.py
index def04db..e892a4b 100644
--- a/yacht/download_pretrained_ref_db.py
+++ b/yacht/download_pretrained_ref_db.py
@@ -13,6 +13,14 @@
 # Import global variables
 from .utils import ZENODO_COMMUNITY_URL
 
+def add_arguments(parser):
+    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
+    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs214"], required=True)
+    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--ani_thresh", type=float, choices=[0.80, 0.95, 0.995, 0.9995], required=True)
+    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
+
 def fetch_zenodo_records():
     logger.info("Fetching list of files from Zenodo community 'yacht'")
     try:
@@ -58,17 +66,7 @@ def create_output_folder(outfolder):
         logger.info(f"Creating output folder: {outfolder}")
         os.makedirs(outfolder)
 
-def main():
-    parser = argparse.ArgumentParser(description="Download pretrained models for YACHT from Zenodo.")
-    parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
-    parser.add_argument("--db_version", choices=["genbank-2022.03", "rs214"], required=True)
-    parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
-    parser.add_argument("--ani_thresh", type=float, choices=[0.80, 0.95, 0.995, 0.9995], required=True)
-    parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
-
-    args = parser.parse_args()
-
+def main(args):
     ## Check if the input arguments are valid
     if args.database not in ["genbank", "gtdb"]:
         logger.error(f"Invalid database: {args.database}. Now only support genbank and gtdb.")
@@ -138,4 +136,9 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(
+        description="Download pretrained models for YACHT from Zenodo.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add_arguments(parser)
+    args = parser.parse_args()
+    main(args)

From 446a45c0dd08928c92653eb81bd24197e0947bc5 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 16:36:18 -0500
Subject: [PATCH 63/76] update parameters

---
 yacht/download_default_ref_db.py    |  2 +-
 yacht/download_demofiles.py         |  6 +++---
 yacht/download_pretrained_ref_db.py | 22 +++++++++++++++++-----
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/yacht/download_default_ref_db.py b/yacht/download_default_ref_db.py
index e724e7a..6d3321b 100644
--- a/yacht/download_default_ref_db.py
+++ b/yacht/download_default_ref_db.py
@@ -18,7 +18,7 @@ def add_arguments(parser):
     parser.add_argument("--db_version", choices=["genbank-2022.03", "rs202", "rs207", "rs214"], required=True)
     parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
     parser.add_argument("--gtdb_type", choices=[None, "reps", "full"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, default=31)
     parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
 
 def generate_download_url(args):
diff --git a/yacht/download_demofiles.py b/yacht/download_demofiles.py
index e48013a..7c33c3b 100644
--- a/yacht/download_demofiles.py
+++ b/yacht/download_demofiles.py
@@ -13,7 +13,7 @@
 from .utils import GITHUB_API_URL, GITHUB_RAW_URL
 
 def add_arguments(parser):
-    parser.add_argument("--output", help="Output folder.", default="demo")
+    parser.add_argument("--outfolder", help="Output folder.", default="demo")
 
 def download_file(url, output_path):
     response = requests.get(url)
@@ -43,8 +43,8 @@ def download_demo_files(output_folder):
             download_file(GITHUB_RAW_URL.format(path=file_path), output_file_path)
 
 def main(args):
-    logger.info(f"Starting download of YACHT demo files to {args.output}")
-    download_demo_files(args.output)
+    logger.info(f"Starting download of YACHT demo files to {args.outfolder}")
+    download_demo_files(args.outfolder)
     logger.info("Download completed.")
 
 if __name__ == "__main__":
diff --git a/yacht/download_pretrained_ref_db.py b/yacht/download_pretrained_ref_db.py
index e892a4b..eb1d21d 100644
--- a/yacht/download_pretrained_ref_db.py
+++ b/yacht/download_pretrained_ref_db.py
@@ -17,7 +17,7 @@ def add_arguments(parser):
     parser.add_argument("--database", choices=['genbank', 'gtdb'], required=True)
     parser.add_argument("--db_version", choices=["genbank-2022.03", "rs214"], required=True)
     parser.add_argument("--ncbi_organism", choices=["archaea", "bacteria", "fungi", "virus", "protozoa"], default=None)
-    parser.add_argument("--k", choices=[21, 31, 51], type=int, required=True)
+    parser.add_argument("--k", choices=[21, 31, 51], type=int, default=31)
     parser.add_argument("--ani_thresh", type=float, choices=[0.80, 0.95, 0.995, 0.9995], required=True)
     parser.add_argument("--outfolder", help="Output folder for downloaded files.", default=".")
 
@@ -66,6 +66,19 @@ def create_output_folder(outfolder):
         logger.info(f"Creating output folder: {outfolder}")
         os.makedirs(outfolder)
 
+def unzip_file(file_path, extract_to):
+    try:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to)
+        logger.success(f"Extracted {file_path} to {extract_to}")
+    except zipfile.BadZipFile:
+        logger.error(f"Failed to unzip {file_path}. It might not be a zip file.")
+
+def create_output_folder(outfolder):
+    if not os.path.exists(outfolder):
+        logger.info(f"Creating output folder: {outfolder}")
+        os.makedirs(outfolder)
+
 def main(args):
     ## Check if the input arguments are valid
     if args.database not in ["genbank", "gtdb"]:
@@ -129,11 +142,10 @@ def main(args):
     file_url = next((file.get('links', {}).get('self') for record in zenodo_records for file in record.get('files', [])
                      if file_name_to_search in file.get('key', '')), None)
 
-    if file_url:
-        download_file(file_url, output_path)
+    if file_url and download_file(file_url, output_path):
+        unzip_file(output_path, args.outfolder)
     else:
-        logger.error(f"File {file_name_to_search} not found on Zenodo.")
-
+        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(

From 5f10b3a56219753655196fb105a04c36b3ecdbac Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Thu, 7 Dec 2023 16:36:37 -0500
Subject: [PATCH 64/76] update README file

---
 README.md | 137 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 100 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index 8621355..35f894d 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ We provide a demo to show how to use YACHT. Please follow the command lines belo
 ```bash
 NUM_THREADS=64 # Adjust based on your machine's capabilities
 
-cd demo
+cd demo # the 'demo' folder can be downloaded via command 'yacht download demo' if it doesn't exist
 
 # build k-mer sketches for the query sample and ref genomes
 sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq
@@ -48,18 +48,23 @@ There will be an output EXCEL file `result.xlsx` recoding the presence of refere
     + [Using Mamba](#using-mamba)
     + [Using Docker](#using-docker)
 - [Usage](#usage)
+  * [YACHT Commands Overview](#yacht-commands-overview)
+  * [YACHT workflow](#yacht-workflow)
   * [Creating sketches of your reference database genomes](#creating-sketches-of-your-reference-database-genomes)
-    + [Some pre-trained reference databases available on Zenodo](#some-pre-trained-reference-databases-available-on-zenodo)
+    + [Automatic download of reference sketches](#automatic-download-of-reference-sketches)
+    + [Manual download of reference sketches](#manual-download-of-reference-sketches)
   * [Creating sketches of your sample](#creating-sketches-of-your-sample)
     + [Parameters](#parameters)
     + [Output](#output)
-  * [Creating a reference dictionary matrix](#creating-a-reference-dictionary-matrix)
-    + [Parameter](#parameter)
-    + [Output (to check after Chunyu's update)](#output-to-check-after-chunyus-update)
-  * [Run the YACHT algorithm](#run-the-yacht-algorithm)
-    + [Parameter](#parameter-1)
+  * [Preprocess the reference genomes](#preprocess-the-reference-genomes-yacht-train)
+    + [Parameters](#parameters-1)
     + [Output](#output-1)
+    + [Some pre-trained reference databases available on Zenodo](#some-pre-trained-reference-databases-available-on-zenodo)
+  * [Run the YACHT algorithm](#run-the-yacht-algorithm)
+    + [Parameters](#parameters-2)
+    + [Output](#output-2)
   * [Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn)](#convert-yacht-result-to-other-popular-output-formats-eg-cami-profiling-format-biom-format-graphplan)
+    + [Parameters](#parameters-3)
 
 
 ## Installation
@@ -81,10 +86,10 @@ conda install -c bioconda yacht
 ```
 
 ### Manual installation
-YACHT requires Python 3 or higher. We recommend using a virtual environment to ensure a clean and isolated workspace. This can be accomplished using either [Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) or [Mamba](https://github.com/mamba-org/mamba) (a faster alternative to Conda).
+YACHT requires Python 3.6 or higher. We recommend using a virtual environment to ensure a clean and isolated workspace. This can be accomplished using either [Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) or [Mamba](https://github.com/mamba-org/mamba) (a faster alternative to Conda).
 
 #### Using Conda
-To create and manage your virtual environment using Mamba, follow these steps:
+To create your Conda environment and install YACHT, follow these steps:
 
 ```bash
 # Clone the YACHT repository
@@ -106,36 +111,91 @@ If you prefer using Mamba instead of Conda, just simply repalce `conda` with `ma
 
 
 #### Using Docker
-If you prefer running YACHT on MacOS, you can choose to use docker with [Act](https://github.com/nektos/act)
-To run YACHT on docker, simply execute "act" from the main YACHT folder, or "act --container-architecture linux/amd64" if you are on mac.
+If you prefer running YACHT on MacOS, you can choose to use docker with [Act](https://github.com/nektos/act). To run YACHT on docker, simply execute "act" from the main YACHT folder, or "act --container-architecture linux/amd64" if you are on MacOS system.
 
 </br>
 
 ## Usage
 
-The workflow for YACHT is as follows: 
-
-1. **Creating Sketches of Your Reference Database Genomes and Your Sample:**
+### YACHT Commands Overview
+YACHT can be run via the command line `yacht <module>`. Now it has three four main modules: `download`, `train`, `run`, and `convert`.
+
+- The `download` module has three submodules: `demo`, `default_ref_db`, and `pretrained_ref_db`:
+  
+  + `demo` can automatically download the demo files to a specified folder:
+  ```bash
+  # Example
+  yacht download demo --outfolder ./demo
+  ```
+  + `default_ref_db` can automatically download pre-generated sketches of reference genomes from GTDB or GenBank as our input reference databases.
+  ```bash
+  # Example for downloading the k31 sketches of representative genomes of GTDB rs214 version 
+  yacht download default_ref_db --database gtdb --db_version rs214 --gtdb_type reps --k 31 --outfolder ./
+  ```
+  | Parameter         | Explanation                                                  |
+  | ----------------- | ------------------------------------------------------------ |
+  | database          | two options for default reference databases: 'genbank' or 'gtdb' |
+  | db_version        | the version of database, options: "genbank-2022.03", "rs202", "rs207", "rs214" |
+  | ncbi_organism     | the NCBI organism for the NCBI reference genome, options: "archaea", "bacteria", "fungi", "virus", "protozoa"|
+  | gtdb_type         | for GTDB database, chooses "representative" genome version or "full" genome version |
+  | k                 | the length of k-mer |
+  | outfolder         | the path to a folder where the downloaded file is expected to locate |
+
+
+  + `pretrained_ref_db` can automatically download our pre-trained reference genome database that can be directly used as input for `yacht train` module.
+  ```bash
+  # Example for downloading the pretrained reference database that was trained from GTDB rs214 representative genomes with k=31 and ani_threshold=0.9995
+  yacht download pretrained_ref_db --database gtdb --db_version rs214 --k 31 --ani_thresh 0.9995 --outfolder ./
+  ```
+  | Parameter         | Explanation                                                  |
+  | ----------------- | ------------------------------------------------------------ |
+  | database          | two options for default reference databases: 'genbank' or 'gtdb' |
+  | db_version        | the version of database, options: "genbank-2022.03", "rs214" |
+  | ncbi_organism     | the NCBI organism for the NCBI reference genome, options: "archaea", "bacteria", "fungi", "virus", "protozoa"|
+  | ani_thresh      | the cutoff by which two organisms are considered indistinguishable (default: 0.95) |
+  | k                 | the length of k-mer |
+  
+  | outfolder         | the path to a folder where the downloaded file is expected to locate |
+
+- The `train` module pre-reprocesses the given sketches of reference genomes (the `.zip` file) to identify and merge the "identical' genomes based on the given ANI threshold (e.g., --ani_threshold 0.95). For an example, please refer to the `yacht train` command in the "Quick start" section.
+
+- The `run` module runs the YACHT algorithm to detect the presence of reference genomes in a given sample. For an example, please refer to the `yacht run` command in the "Quick start" section.
+
+- The `convert` module can covert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn). For an example, please refer to the `yacht convert` command in the "Quick start" section.
+
+### YACHT workflow
+
+This section simply introduces the analysis workflow for YACHT:
+
+1. **Create Sketches of Your Reference Database Genomes and Your Sample:**
    - This involves using [sourmash](https://sourmash.readthedocs.io/en/latest/) to generate compact representations (sketches) of genomic data for efficient comparison and analysis.
-2. **Preprocessing the Reference Genomes:**
-   - This is the training step of YACHT, aiming to remove the "too similar" genomes based on Average Nucleotide Identity (`ANI`) using the `ani_thresh` parameter. 
+2. **Preprocess the Reference Genomes:**
+   - This is the training step of YACHT, aiming to identify and merge the "identical" genomes based on Average Nucleotide Identity (`ANI`) using the `ani_thresh` parameter. 
 3. **Run YACHT algorithm:** 
    - This step involves running the YACHT algorithm to detect the presence of reference genomes in your sample.
+4. **Convert YACHT result to other output formats**
+   - This step is optional if you prefer other output formats (e.g., CAMI profiling format, BIOM format) for the downstream analysis.
 
 
-**See below sections for more details of each step in the workflow.**
+For each step of this workflow, please see more detailed description in the sections below.
 
 </br>
 
 ### Creating sketches of your reference database genomes
 
-You will need a reference database in the form of [sourmash](https://sourmash.readthedocs.io/en/latest/) sketches of a collection of microbial genomes. There are a variety of pre-created databases available at: https://sourmash.readthedocs.io/en/latest/databases.html. Our code uses the "Zipfile collection" format, and we suggest using the [GTDB genomic representatives database](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip):
+You will need a reference database in the form of sourmash sketches of a collection of microbial genomes. There are a variety of pre-created databases available at: https://sourmash.readthedocs.io/en/latest/databases.html. Our code uses the "Zipfile collection" format, and we suggest using the [GTDB genomic representatives database](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip):
 
+#### Automatic download of reference sketches
+```bash
+yacht download default_ref_db --database gtdb --db_version rs214 --gtdb_type reps --k 31 --outfolder ./
+```
+
+#### Manual download of reference sketches
 ```bash
 wget https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip
 ```
 
-If you want to use a custom database, you will need to create a Sourmash sketch Zipfile collection from the FASTA/FASTQ files of your reference database genomes (see [Sourmash documentation](https://sourmash.readthedocs.io/en/latest/) for details). In brief, this can be accomplished via the following:
+If you want to use a custom database, you will need to create a Sourmash sketch Zipfile collection from the FASTA/FASTQ files of your reference database genomes (see [Sourmash documentation](https://sourmash.readthedocs.io/en/latest/) for details). In brief, this can be accomplished via the following commands:
 
 If you have a single FASTA file with _one genome_ per record:
 
@@ -158,22 +218,11 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund *.fasta -o ../training_database
 # cd back to YACHT
 ```
 
-#### Some pre-trained reference databases available on Zenodo  
-
-For convenience, we have provided some pre-trained reference database for the GenBank and GTDB genomes on [Zenodo](https://zenodo.org/communities/yacht?q=&l=list&p=1&s=10&sort=newest). If any of them is suitable for your study, you can simply run the following command to download it and skip the training step below:
-```bash
-# remember to replace <zendo_id> and <file_name> for your case before running it
-curl --cookie zenodo-cookies.txt "https://zenodo.org/records/<zendo_id>/files/<file_name>?download=1" --output <file_name>
-
-# Example
-# curl --cookie zenodo-cookies.txt "https://zenodo.org/records/10113534/files/genbank-2022.03-archaea-k31_0.80_pretrained.zip?download=1" --output genbank-2022.03-archaea-k31_0.80_pretrained.zip
-```
-
 </br>
 
 ### Creating sketches of your sample
 
-You will then create a sketch of your sample metagenome, using the same k-mer size and scale factor
+Creating a sketch of your sample metagenome is an essential step in the YACHT workflow. This process involves using the same k-mer size and scale factor that were used for the reference database. You can use the following commands to implement this step:
 
 ```bash
 # For a single-end FASTA/Q file
@@ -208,9 +257,9 @@ In the two preceding steps, you will obtain a k-mer sketch file in zip format (i
 
 </br>
 
-### Preprocess the reference genomes (Training Step)
+### Preprocess the reference genomes (yacht train)
 
-##### Warning: the training process is time-consuming on large database
+**Warning: the training process is time-consuming on large database**
 
 In our benchmark with `GTDB representive genomes`, it takes `15 minutes` using `16 threads, 50GB of MEM` on a system equipped with a `3.5GHz AMD EPYC 7763 64-Core Processor`. You can use the pre-trained database (see [here](#some-pre-trained-reference-databases-available-on-zenodo)) to skip this step. The processing time can be significant when executed on GTDB all genomes OR with limited resources. If only part of genomes are needed, one may use `sourmash sig` command to extract signatures of interests only. 
 
@@ -222,7 +271,7 @@ The command `yacht train` extracts the sketches from the Zipfile-format referenc
 yacht train --ref_file gtdb-rs214-reps.k31.zip --ksize 31 --num_threads 32 --ani_thresh 0.95 --prefix 'gtdb_ani_thresh_0.95' --outdir ./
 ```
 
-#### Parameter
+#### Parameters
 
 The most important parameter of this script is `--ani_thresh`: this is average nucleotide identity (ANI) value equal to or below which two organisms are considered distinct. For example, if `--ani_thresh` is set to 0.95, then two organisms with ANI > 0.95 will be considered indistinguishable. For the organisms with ANI > 0.95, only the one with the largest number of unique kmers will be kept. If there is a tie in the number of unique kmers, one organism will be randomly selected. The default value of `--ani_thresh` is 0.95. The `--ani_thresh` value chosen here must match the one chosen for the YACHT algorithm (see below).  
 
@@ -243,10 +292,22 @@ The most important parameter of this script is `--ani_thresh`: this is average n
 | _manifest.tsv                         | A TSV file contains organisms and their relevant info after removing the similar ones |
 | _removed_orgs_to_corr_orgas_mapping.tsv   | A TSV file with two columns: removed organism names ('removed_org') and their similar genomes ('corr_orgs')| 
 
+#### Some pre-trained reference databases available on Zenodo  
+
+For convenience, we have provided some pre-trained reference database for the GenBank and GTDB genomes on [Zenodo](https://zenodo.org/communities/yacht?q=&l=list&p=1&s=10&sort=newest). If any of them is suitable for your study, you can simply run the following command to download it and skip the training step below:
+```bash
+# remember to replace <zendo_id> and <file_name> for your case before running it
+curl --cookie zenodo-cookies.txt "https://zenodo.org/records/<zendo_id>/files/<file_name>?download=1" --output <file_name>
+
+# Example
+# curl --cookie zenodo-cookies.txt "https://zenodo.org/records/10113534/files/genbank-2022.03-archaea-k31_0.80_pretrained.zip?download=1" --output genbank-2022.03-archaea-k31_0.80_pretrained.zip
+```
+
+**Please note that if you plan to use these pre-trained reference databases, once you download and unzip it. You need to change the paths within the config json file (e.g., gtdb-rs214-reps.k31_0.9995_config.json) to the correct paths in your machine.**
 
 </br>
 
-### Run the YACHT algorithm
+### Run the YACHT algorithm (yacht run)
 
 After this, you are ready to perform the hypothesis test via `yacht run` for each organism in your reference database. This can be accomplished with something like:
 
@@ -254,7 +315,7 @@ After this, you are ready to perform the hypothesis test via `yacht run` for eac
 yacht run --json 'gtdb_ani_thresh_0.95_config.json' --sample_file 'sample.sig.zip' --num_threads 32 --keep_raw --significance 0.99 --min_coverage_list 1 0.5 0.1 0.05 0.01 --out ./result.xlsx
 ```
 
-#### Parameter
+#### Parameters
 
 The `--significance` parameter is basically akin to your confidence level: how sure do you want to be that the organism is present? Higher leads to more false negatives, lower leads to more false positives. 
 
@@ -287,7 +348,7 @@ Other interesting columns include:
 
 </br>
 
-### Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn)
+### Convert YACHT result to other popular output formats (yacht convert)
 
 When we get the EXCEL result file from run_YACHT.py, you can run `yacht convert` to covert the YACHT result to other popular output formats (Currently, only `cami`, `biom`, `graphplan` are supported).
 
@@ -298,6 +359,8 @@ Then you are ready to run `yacht convert` with something like:
 yacht convert --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --genome_to_taxid 'genome_to_taxid.tsv' --mode 'cami' --sample_name 'MySample' --outfile_prefix 'cami_result' --outdir ./
 ```
 
+#### Parameters
+
 | Parameter         | Explanation                                                  |
 | ----------------- | ------------------------------------------------------------ |
 | --yacht_output    | the path to the output excel file generated by `run_YACHT.py` |

From f8b2c42e8ff4e4646d1efce5726437912bcd76c6 Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:00:36 -0500
Subject: [PATCH 65/76] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 35f894d..37843ee 100644
--- a/README.md
+++ b/README.md
@@ -254,7 +254,6 @@ In the two preceding steps, you will obtain a k-mer sketch file in zip format (i
 | sample.sig.zip            | K-mer sketch file for your input sample                     |
 
 
-
 </br>
 
 ### Preprocess the reference genomes (yacht train)

From c765b4590f4b0d7d588503e245b592998534d9d3 Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:05:42 -0500
Subject: [PATCH 66/76] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 37843ee..3a2e0f3 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,6 @@ There will be an output EXCEL file `result.xlsx` recoding the presence of refere
   * [Convert YACHT result to other popular output formats (e.g., CAMI profiling format, BIOM format, GraphPlAn)](#convert-yacht-result-to-other-popular-output-formats-eg-cami-profiling-format-biom-format-graphplan)
     + [Parameters](#parameters-3)
 
-
 ## Installation
 
 **Please note YACHT does not currently support MacOS. However, we are actively working on developing compatibility for this operating system and hope to have it available soon. During this time, we provide a docker container (see `using docker` section below) for those who need to run YACHT on MacOS.**

From 6755ef388bc8cbde620a1da31952372b554336b3 Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:11:18 -0500
Subject: [PATCH 67/76] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 3a2e0f3..de838a5 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,7 @@ sourmash sketch dna -f -p k=31,scaled=1000,abund *.fasta -o ../training_database
 
 </br>
 
+
 ### Creating sketches of your sample
 
 Creating a sketch of your sample metagenome is an essential step in the YACHT workflow. This process involves using the same k-mer size and scale factor that were used for the reference database. You can use the following commands to implement this step:

From 2078a0681e08130547ae5e493267c5d9395e324c Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:15:19 -0500
Subject: [PATCH 68/76] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index de838a5..6f5b7d0 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,6 @@ pip install .
 #### Using Mamba
 If you prefer using Mamba instead of Conda, just simply repalce `conda` with `mamba` in the above commands.
 
-
 #### Using Docker
 If you prefer running YACHT on MacOS, you can choose to use docker with [Act](https://github.com/nektos/act). To run YACHT on docker, simply execute "act" from the main YACHT folder, or "act --container-architecture linux/amd64" if you are on MacOS system.
 

From e8299ae5d1ed046a7a1fb5c9829e6c6fb2831ece Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:21:33 -0500
Subject: [PATCH 69/76] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 6f5b7d0..6597767 100644
--- a/README.md
+++ b/README.md
@@ -174,7 +174,6 @@ This section simply introduces the analysis workflow for YACHT:
 4. **Convert YACHT result to other output formats**
    - This step is optional if you prefer other output formats (e.g., CAMI profiling format, BIOM format) for the downstream analysis.
 
-
 For each step of this workflow, please see more detailed description in the sections below.
 
 </br>

From 524317c567b6131f18a146a7a4f6ee9f62046dfe Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:28:41 -0500
Subject: [PATCH 70/76] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 6597767..add59a0 100644
--- a/README.md
+++ b/README.md
@@ -368,4 +368,3 @@ yacht convert --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --gen
 | --outfile_prefix  | the prefix of the output file. Default: result | 
 | --outdir          | the path to output directory where the results will be genreated |
 
-

From f1e5599e8718f2b75d82ca3c3530b0c38e1ecd78 Mon Sep 17 00:00:00 2001
From: mfl15 <144844529+mfl15@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:34:39 -0500
Subject: [PATCH 71/76] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index add59a0..ddb977d 100644
--- a/README.md
+++ b/README.md
@@ -368,3 +368,5 @@ yacht convert --yacht_output 'result.xlsx' --sheet_name 'min_coverage0.01' --gen
 | --outfile_prefix  | the prefix of the output file. Default: result | 
 | --outdir          | the path to output directory where the results will be genreated |
 
+
+

From 7a7940f068a1a7dc7066c26ff52cf4f90f6dde7d Mon Sep 17 00:00:00 2001
From: Maksym Lupei <55431857+mlupei@users.noreply.github.com>
Date: Thu, 7 Dec 2023 17:39:11 -0500
Subject: [PATCH 72/76] Create codecov.yml

---
 codecov.yml | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 codecov.yml

diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..ea2cdd7
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,8 @@
+coverage:
+  status:
+    patch:
+      default:
+        informational: true
+    project:
+      default:
+        informational: true

From 788e68e722377f52c7c8a0e1f72b9c3a984ee8bf Mon Sep 17 00:00:00 2001
From: Mohsen Taheri <mohsen.taheri.1989@gmail.com>
Date: Thu, 7 Dec 2023 19:12:51 -0500
Subject: [PATCH 73/76] remove meta.yaml file

---
 meta.yaml | 45 ---------------------------------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 meta.yaml

diff --git a/meta.yaml b/meta.yaml
deleted file mode 100644
index 68c07a9..0000000
--- a/meta.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-package:
-  name: yacht
-  version: 1.0
-
-source:
-  path: .
-
-build:
-  noarch: python
-  script: "{{ PYTHON }} -m pip install . --ignore-installed --no-cache-dir -vvv"
-  script_env:
-    - PYTHON
-
-requirements:
-  build:
-    - python
-    - setuptools
-  host:
-    - python >3.6
-    - pip
-  run:
-    - python >3.6
-    - gcc
-    - sourmash >=4.8.3,<5
-    - rust
-    - scipy
-    - numpy
-    - pandas
-    - scikit-learn
-    - codecov
-    - pytest
-    - pytest-cov
-    - loguru
-    - maturin >=1,<2
-    - tqdm
-    - pip
-    - biom-format
-    - pytaxonkit
-    - sourmash_plugin_branchwater
-    - openpyxl
-
-about:
-  home: https://github.com/KoslickiLab/YACHT
-  license: MIT License
-  summary: YACHT is a mathematically rigorous hypothesis test for the presence or absence of organisms in a metagenomic sample, based on average nucleotide identity (ANI).

From b6ed7b9b54dcf3820b5467714b8bc0aaf7ae0c27 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 8 Dec 2023 11:33:11 -0500
Subject: [PATCH 74/76] fixed the test bug issues

---
 tests/test_utils.py | 62 +++------------------------------------------
 1 file changed, 4 insertions(+), 58 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index daa928f..f973381 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -125,50 +125,8 @@ def test_6(self):
         indices = utils.get_column_indices(column_name_to_index)
         assert indices[4] is None
 
-class TestGetCamiProfile(unittest.TestCase):
-    def test_1(self):
-        file_path = os.path.join(os.path.dirname(__file__), 'testdata/sample_cami.txt')
-        with open(file_path, 'r') as file:
-            sample_cami_content = file.readlines()
-
-        profiles = utils.get_cami_profile(sample_cami_content)
-
-        expected_header = {
-            'SAMPLEID': 'CAMI_LOW_S001',
-            'VERSION': '0.9.1',
-            'RANKS': 'superkingdom|phylum|class|order|family|genus|species|strain',
-            'TAXONOMYID': 'ncbi-taxonomy_DATE',
-            '__PROGRAM__': 'unknown'
-        }
-
-        assert len(profiles) == 1
-        sample_id, header, profile = profiles[0]
-
-        assert sample_id == "CAMI_LOW_S001"
-        assert header == expected_header
-        assert len(profile) == 2044
-
-        prediction1 = profile[0]
-        assert prediction1.rank == 'superkingdom'
-        assert prediction1.taxid == '2157'
-        assert math.isclose(prediction1.percentage, 0.029528, abs_tol=1e-6)
-        assert prediction1.taxpath == '2157'
-        assert prediction1.taxpathsn == 'Archaea'
-
-        prediction2 = profile[1]
-        assert prediction2.rank == 'superkingdom'
-        assert prediction2.taxid == '2'
-        assert math.isclose(prediction2.percentage, 29.183763, rel_tol=1e-6)
-        assert prediction2.taxpath == '2'
-        assert prediction2.taxpathsn == 'Bacteria'
-
-
 class TestStandardizeOutput(unittest.TestCase):
     def test_everything_exists(self):
-        script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-        script_dir = os.path.join(script_dir, 'srcs')
-        script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py')
-        assert os.path.exists(script_full_path)
 
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
         assert os.path.exists(yacht_output)
@@ -179,16 +137,12 @@ def test_everything_exists(self):
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
         assert os.path.exists(outdir)
 
-        cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         res = subprocess.run(cmd, shell=True, check=True)
         assert res.returncode == 0
         assert os.path.exists(os.path.join(outdir, 'cami_result.cami'))
 
     def test_wrong_yacht_output(self):
-        script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-        script_dir = os.path.join(script_dir, 'srcs')
-        script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py')
-        assert os.path.exists(script_full_path)
 
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx')
         assert not os.path.exists(yacht_output)
@@ -199,15 +153,11 @@ def test_wrong_yacht_output(self):
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
         assert os.path.exists(outdir)
 
-        cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
             res = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_genome_to_taxid(self):
-        script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-        script_dir = os.path.join(script_dir, 'srcs')
-        script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py')
-        assert os.path.exists(script_full_path)
 
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
         assert os.path.exists(yacht_output)
@@ -218,15 +168,11 @@ def test_wrong_genome_to_taxid(self):
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
         assert os.path.exists(outdir)
 
-        cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
             res = subprocess.run(cmd, shell=True, check=True)
 
     def test_wrong_outdir(self):
-        script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-        script_dir = os.path.join(script_dir, 'srcs')
-        script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py')
-        assert os.path.exists(script_full_path)
 
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
         assert os.path.exists(yacht_output)
@@ -242,7 +188,7 @@ def test_wrong_outdir(self):
             pass
         assert not os.path.exists(outdir)
 
-        cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         res = subprocess.run(cmd, shell=True, check=True)
         assert res.returncode == 0
         assert os.path.exists(outdir)

From 43d199669eafeb89216881addbadbfe845c75f20 Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 8 Dec 2023 11:39:39 -0500
Subject: [PATCH 75/76] remove the duplicated tests

---
 tests/integration_tests.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/tests/integration_tests.py b/tests/integration_tests.py
index 08c2d9d..6386cca 100644
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -71,20 +71,3 @@ def test_run_yacht():
 
     assert exists('result.xlsx')
 
-def test_run_yacht_and_standardizer():
-    cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq"
-    res = subprocess.run(cmd, shell=True, check=True)
-    assert res.returncode == 0
-    cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists"
-    res = subprocess.run(cmd, shell=True, check=True)
-    assert res.returncode == 0
-    cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./"
-    res = subprocess.run(cmd, shell=True, check=True)
-    assert res.returncode == 0
-    cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx"
-    res = subprocess.run(cmd, shell=True, check=True)
-    assert res.returncode == 0
-    cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./"
-    res = subprocess.run(cmd, shell=True, check=True)
-    assert res.returncode == 0
-    assert exists('demo/cami_result.cami')

From 763a87e95799f737793b21d1797235089a094efc Mon Sep 17 00:00:00 2001
From: Chunyu Ma <machunyu4402@hotmail.com>
Date: Fri, 8 Dec 2023 11:52:24 -0500
Subject: [PATCH 76/76] fixed a potential bug

---
 tests/test_standardize_output.py | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/tests/test_standardize_output.py b/tests/test_standardize_output.py
index 8e30203..b66385e 100644
--- a/tests/test_standardize_output.py
+++ b/tests/test_standardize_output.py
@@ -2,10 +2,6 @@
 import os
 import subprocess
 
-def get_script_path():
-    script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-    return os.path.join(script_dir, 'srcs', 'standardize_yacht_output.py')
-
 def assert_file_exists(file_path):
     assert os.path.exists(file_path)
 
@@ -26,20 +22,17 @@ def cleanup_outdir(outdir):
     assert res.returncode == 0
 
 class TestScript(unittest.TestCase):
-    def setUp(self):
-        self.script_full_path = get_script_path()
 
     def test_everything_exists(self):
         yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx')
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
-        assert_file_exists(self.script_full_path)
         assert_file_exists(yacht_output)
         assert_file_exists(genome_to_taxid)
         assert_file_exists(outdir)
 
-        cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         res = subprocess.run(cmd, shell=True, check=True)
         assert res.returncode == 0
         assert_file_exists(os.path.join(outdir, 'cami_result.cami'))
@@ -49,12 +42,11 @@ def test_wrong_yacht_output(self):
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
-        assert_file_exists(self.script_full_path)
         assert_file_not_exists(yacht_output)
         assert_file_exists(genome_to_taxid)
         assert_file_exists(outdir)
 
-        cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
             res = subprocess.run(cmd, shell=True, check=True)
 
@@ -63,12 +55,11 @@ def test_wrong_genome_to_taxid(self):
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv')
         outdir = os.path.join(os.path.dirname(__file__), 'testdata')
 
-        assert_file_exists(self.script_full_path)
         assert_file_exists(yacht_output)
         assert_file_not_exists(genome_to_taxid)
         assert_file_exists(outdir)
 
-        cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         with self.assertRaises(subprocess.CalledProcessError):
             res = subprocess.run(cmd, shell=True, check=True)
 
@@ -77,12 +68,11 @@ def test_wrong_outdir(self):
         genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv')
         outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting')
 
-        assert_file_exists(self.script_full_path)
         assert_file_exists(yacht_output)
         assert_file_exists(genome_to_taxid)
         create_outdir(outdir)
 
-        cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
+        cmd = f"yacht convert --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}"
         res = subprocess.run(cmd, shell=True, check=True)
         assert res.returncode == 0
         assert_file_exists(outdir)