diff --git a/tests/integration_tests.py b/tests/integration_tests.py index 4d15e35..14eaa8e 100644 --- a/tests/integration_tests.py +++ b/tests/integration_tests.py @@ -22,15 +22,15 @@ def make_train_fasta(): def test_sourmash_sketch_command(): with tempfile.TemporaryDirectory() as tmp_dir: make_train_fasta() - + fasta_file = "example.fasta" output_file = os.path.join(tmp_dir, "training_database.sig.zip") cmd = [ "sourmash", "sketch", "dna", "-f", "-p", "k=31,scaled=1000,abund", "--singleton", fasta_file, "-o", output_file ] - + subprocess.run(cmd, check=True) - + assert os.path.isfile(output_file) def test_make_training_data_from_sketches(): @@ -63,11 +63,29 @@ def test_make_training_data_from_sketches(): config = json.load(f) assert config['ksize'] == int(ksize) assert config['ani_thresh'] == float(ani_thresh) - + def test_run_yacht(): cmd = "python run_YACHT.py --json gtdb_ani_thresh_0.95_config.json --sample_file 'tests/testdata/sample.sig.zip' --significance 0.99 --min_coverage_list 1 0.6 0.2 0.1" - + res = subprocess.run(cmd, shell=True, check=True) assert res.returncode == 0 assert exists('result.xlsx') + +def test_run_yacht_and_standardizer(): + cmd = "cd demo; sourmash sketch dna -f -p k=31,scaled=1000,abund -o sample.sig.zip query_data/query_data.fq" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + cmd = "cd demo; sourmash sketch fromfile ref_paths.csv -p dna,k=31,scaled=1000,abund -o ref.sig.zip --force-output-already-exists" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + cmd = "cd demo; python ../make_training_data_from_sketches.py --force --ref_file ref.sig.zip --ksize 31 --num_threads 1 --ani_thresh 0.95 --prefix 'demo_ani_thresh_0.95' --outdir ./" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + cmd = "cd demo; python ../run_YACHT.py --json demo_ani_thresh_0.95_config.json --sample_file sample.sig.zip --significance 0.99 --num_threads 1 --min_coverage_list 1 0.6 0.2 0.1 --out result.xlsx" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + cmd = "cd demo; python ../srcs/standardize_yacht_output.py --yacht_output result.xlsx --sheet_name min_coverage0.2 --genome_to_taxid toy_genome_to_taxid.tsv --mode cami --sample_name 'MySample' --outfile_prefix cami_result --outdir ./" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + assert exists('demo/cami_result.cami') diff --git a/tests/test_standardize_output.py b/tests/test_standardize_output.py new file mode 100644 index 0000000..8e30203 --- /dev/null +++ b/tests/test_standardize_output.py @@ -0,0 +1,93 @@ +import unittest +import os +import subprocess + +def get_script_path(): + script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + return os.path.join(script_dir, 'srcs', 'standardize_yacht_output.py') + +def assert_file_exists(file_path): + assert os.path.exists(file_path) + +def assert_file_not_exists(file_path): + assert not os.path.exists(file_path) + +def create_outdir(outdir): + cmd = f'rm -rf {outdir}' + try: + subprocess.run(cmd, shell=True, check=True) + except: + pass + assert not os.path.exists(outdir) + +def cleanup_outdir(outdir): + cmd = f'rm -rf {outdir}' + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + +class TestScript(unittest.TestCase): + def setUp(self): + self.script_full_path = get_script_path() + + def test_everything_exists(self): + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + + assert_file_exists(self.script_full_path) + assert_file_exists(yacht_output) + assert_file_exists(genome_to_taxid) + assert_file_exists(outdir) + + cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + assert_file_exists(os.path.join(outdir, 'cami_result.cami')) + + def test_wrong_yacht_output(self): + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx') + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + + assert_file_exists(self.script_full_path) + assert_file_not_exists(yacht_output) + assert_file_exists(genome_to_taxid) + assert_file_exists(outdir) + + cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + with self.assertRaises(subprocess.CalledProcessError): + res = subprocess.run(cmd, shell=True, check=True) + + def test_wrong_genome_to_taxid(self): + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv') + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + + assert_file_exists(self.script_full_path) + assert_file_exists(yacht_output) + assert_file_not_exists(genome_to_taxid) + assert_file_exists(outdir) + + cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + with self.assertRaises(subprocess.CalledProcessError): + res = subprocess.run(cmd, shell=True, check=True) + + def test_wrong_outdir(self): + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting') + + assert_file_exists(self.script_full_path) + assert_file_exists(yacht_output) + assert_file_exists(genome_to_taxid) + create_outdir(outdir) + + cmd = f"python {self.script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + assert_file_exists(outdir) + + cleanup_outdir(outdir) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index f23a541..cc4e686 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,6 +10,14 @@ sys.path.append(project_path) from srcs import utils import sourmash +import unittest +import math +import json +import pytest +import tempfile +import gzip +import sys +import shutil def to_testing_data(file): @@ -55,9 +63,194 @@ def test_load_signature_with_ksize3(): assert type(sig) == sourmash.signature.FrozenSourmashSignature assert sig.jaccard(sig) == 1.0 +class TestGetColumnIndices(unittest.TestCase): + def test_1(self): + column_name_to_index = { + "TAXID": 1, + "RANK": 0, + "PERCENTAGE": 2, + "TAXPATH": 3, + "TAXPATHSN": 4 + } + indices = utils.get_column_indices(column_name_to_index) + assert indices == (0, 1, 2, 3, 4) + + def test_2(self): + column_name_to_index = { + "RANK": 0, + "PERCENTAGE": 2, + "TAXPATH": 3, + "TAXPATHSN": 4 + } + with self.assertRaises(RuntimeError): + utils.get_column_indices(column_name_to_index) + + def test_3(self): + column_name_to_index = { + "TAXID": 1, + "PERCENTAGE": 2, + "TAXPATH": 3, + "TAXPATHSN": 4 + } + with self.assertRaises(RuntimeError): + utils.get_column_indices(column_name_to_index) + + def test_4(self): + column_name_to_index = { + "TAXID": 1, + "RANK": 0, + "TAXPATH": 3, + "TAXPATHSN": 4 + } + with self.assertRaises(RuntimeError): + utils.get_column_indices(column_name_to_index) + + def test_5(self): + column_name_to_index = { + "TAXID": 1, + "RANK": 0, + "PERCENTAGE": 2, + "TAXPATHSN": 4 + } + with self.assertRaises(RuntimeError): + utils.get_column_indices(column_name_to_index) + + def test_6(self): + column_name_to_index = { + "TAXID": 1, + "RANK": 0, + "PERCENTAGE": 2, + "TAXPATH": 3 + } + indices = utils.get_column_indices(column_name_to_index) + assert indices[4] is None + +class TestGetCamiProfile(unittest.TestCase): + def test_1(self): + file_path = os.path.join(os.path.dirname(__file__), 'testdata/sample_cami.txt') + with open(file_path, 'r') as file: + sample_cami_content = file.readlines() + + profiles = utils.get_cami_profile(sample_cami_content) + + expected_header = { + 'SAMPLEID': 'CAMI_LOW_S001', + 'VERSION': '0.9.1', + 'RANKS': 'superkingdom|phylum|class|order|family|genus|species|strain', + 'TAXONOMYID': 'ncbi-taxonomy_DATE', + '__PROGRAM__': 'unknown' + } + + assert len(profiles) == 1 + sample_id, header, profile = profiles[0] + + assert sample_id == "CAMI_LOW_S001" + assert header == expected_header + assert len(profile) == 2044 + + prediction1 = profile[0] + assert prediction1.rank == 'superkingdom' + assert prediction1.taxid == '2157' + assert math.isclose(prediction1.percentage, 0.029528, abs_tol=1e-6) + assert prediction1.taxpath == '2157' + assert prediction1.taxpathsn == 'Archaea' + + prediction2 = profile[1] + assert prediction2.rank == 'superkingdom' + assert prediction2.taxid == '2' + assert math.isclose(prediction2.percentage, 29.183763, rel_tol=1e-6) + assert prediction2.taxpath == '2' + assert prediction2.taxpathsn == 'Bacteria' + + +class TestStandardizeOutput(unittest.TestCase): + def test_everything_exists(self): + script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + script_dir = os.path.join(script_dir, 'srcs') + script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py') + assert os.path.exists(script_full_path) + + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + assert os.path.exists(yacht_output) + + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + assert os.path.exists(genome_to_taxid) + + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + assert os.path.exists(outdir) + + cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + assert os.path.exists(os.path.join(outdir, 'cami_result.cami')) + + def test_wrong_yacht_output(self): + script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + script_dir = os.path.join(script_dir, 'srcs') + script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py') + assert os.path.exists(script_full_path) + + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result_nonexisting.xlsx') + assert not os.path.exists(yacht_output) + + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + assert os.path.exists(genome_to_taxid) + + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + assert os.path.exists(outdir) + + cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + with self.assertRaises(subprocess.CalledProcessError): + res = subprocess.run(cmd, shell=True, check=True) + + def test_wrong_genome_to_taxid(self): + script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + script_dir = os.path.join(script_dir, 'srcs') + script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py') + assert os.path.exists(script_full_path) + + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + assert os.path.exists(yacht_output) + + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid_nonexisting.tsv') + assert not os.path.exists(genome_to_taxid) + + outdir = os.path.join(os.path.dirname(__file__), 'testdata') + assert os.path.exists(outdir) + + cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + with self.assertRaises(subprocess.CalledProcessError): + res = subprocess.run(cmd, shell=True, check=True) + + def test_wrong_outdir(self): + script_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + script_dir = os.path.join(script_dir, 'srcs') + script_full_path = os.path.join(script_dir, 'standardize_yacht_output.py') + assert os.path.exists(script_full_path) + + yacht_output = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/result.xlsx') + assert os.path.exists(yacht_output) + genome_to_taxid = os.path.join(os.path.dirname(__file__), 'testdata/standardize_output_testdata/toy_genome_to_taxid.tsv') + assert os.path.exists(genome_to_taxid) + outdir = os.path.join(os.path.dirname(__file__), 'testdata_nonexisting') + cmd = 'rm -rf ' + outdir + try: + subprocess.run(cmd, shell=True, check=True) + except: + pass + assert not os.path.exists(outdir) + cmd = f"python {script_full_path} --yacht_output {yacht_output} --sheet_name min_coverage0.2 --genome_to_taxid {genome_to_taxid} --outfile_prefix cami_result --outdir {outdir}" + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 + assert os.path.exists(outdir) + cmd = 'rm -rf ' + outdir + res = subprocess.run(cmd, shell=True, check=True) + assert res.returncode == 0 +if __name__ == '__main__': + unittest.main() diff --git a/tests/testdata/standardize_output_testdata/cami_result.cami b/tests/testdata/standardize_output_testdata/cami_result.cami new file mode 100644 index 0000000..a5ca06d --- /dev/null +++ b/tests/testdata/standardize_output_testdata/cami_result.cami @@ -0,0 +1,21 @@ +@SampleID:MySample +@Version:0.9.1 +@Ranks:superkingdom|phylum|class|order|family|genus|species + +@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE +2 superkingdom 2 Bacteria 100.0 +1239 phylum 2|1239 Bacteria|Bacillota 100.0 +186801 class 2|1239|186801 Bacteria|Bacillota|Clostridia 100.0 +186802 order 2|1239|186801|186802 Bacteria|Bacillota|Clostridia|Eubacteriales 100.0 +216572 family 2|1239|186801|186802|216572 Bacteria|Bacillota|Clostridia|Eubacteriales|Oscillospiraceae 40.0 +186803 family 2|1239|186801|186802|186803 Bacteria|Bacillota|Clostridia|Eubacteriales|Lachnospiraceae 20.0 +186806 family 2|1239|186801|186802|186806 Bacteria|Bacillota|Clostridia|Eubacteriales|Eubacteriaceae 40.0 +1017280 genus 2|1239|186801|186802|216572|1017280 Bacteria|Bacillota|Clostridia|Eubacteriales|Oscillospiraceae|Pseudoflavonifractor 20.0 +35829 genus 2|1239|186801|186802|216572|35829 Bacteria|Bacillota|Clostridia|Eubacteriales|Oscillospiraceae|Acetivibrio 20.0 +572511 genus 2|1239|186801|186802|186803|572511 Bacteria|Bacillota|Clostridia|Eubacteriales|Lachnospiraceae|Blautia 20.0 +1730 genus 2|1239|186801|186802|186806|1730 Bacteria|Bacillota|Clostridia|Eubacteriales|Eubacteriaceae|Eubacterium 40.0 +2841525 species 2|1239|186801|186802|216572|1017280|2841525 Bacteria|Bacillota|Clostridia|Eubacteriales|Oscillospiraceae|Pseudoflavonifractor|Pseudoflavonifractor sp. MSJ-30 20.0 +2841523 species 2|1239|186801|186802|216572|35829|2841523 Bacteria|Bacillota|Clostridia|Eubacteriales|Oscillospiraceae|Acetivibrio|Acetivibrio sp. MSJd-27 20.0 +2841517 species 2|1239|186801|186802|186803|572511|2841517 Bacteria|Bacillota|Clostridia|Eubacteriales|Lachnospiraceae|Blautia|Blautia sp. MSJ-19 20.0 +2841519 species 2|1239|186801|186802|186806|1730|2841519 Bacteria|Bacillota|Clostridia|Eubacteriales|Eubacteriaceae|Eubacterium|Eubacterium sp. MSJ-21 20.0 +2841513 species 2|1239|186801|186802|186806|1730|2841513 Bacteria|Bacillota|Clostridia|Eubacteriales|Eubacteriaceae|Eubacterium|Eubacterium sp. MSJ-13 20.0 diff --git a/tests/testdata/standardize_output_testdata/result.xlsx b/tests/testdata/standardize_output_testdata/result.xlsx new file mode 100644 index 0000000..44da4cd Binary files /dev/null and b/tests/testdata/standardize_output_testdata/result.xlsx differ diff --git a/tests/testdata/standardize_output_testdata/toy_genome_to_taxid.tsv b/tests/testdata/standardize_output_testdata/toy_genome_to_taxid.tsv new file mode 100644 index 0000000..7c7e53f --- /dev/null +++ b/tests/testdata/standardize_output_testdata/toy_genome_to_taxid.tsv @@ -0,0 +1,16 @@ +genome_id taxid +GCF_018918045.1 2841525 +GCF_018918095.1 2841523 +GCF_018918125.1 2841517 +GCF_018918185.1 2841519 +GCF_018918235.1 2841513 +GCF_018918285.1 2841512 +GCF_018918345.1 2841509 +GCF_907163105.1 2004710 +GCF_907163115.1 316 +GCF_907163125.1 2004710 +GCF_907163135.1 28220 +GCF_907164845.1 2822368 +GCF_907164905.1 2822344 +GCF_907165045.1 2823330 +GCF_907165215.1 76633 diff --git a/tests/testdata/standardize_output_testdata/~$result.xlsx b/tests/testdata/standardize_output_testdata/~$result.xlsx new file mode 100644 index 0000000..9065a50 Binary files /dev/null and b/tests/testdata/standardize_output_testdata/~$result.xlsx differ diff --git a/tests/unittests.py b/tests/unittests.py index 60d6e3c..1d819cf 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -35,54 +35,7 @@ def test_check_file_existence(): non_existing_file = os.path.join(tmp_dir, "non_existing_file.txt") with pytest.raises(ValueError, match=dont_exist): check_file_existence(non_existing_file, dont_exist) - -def test_get_column_indices(): - column_name_to_index = { - "TAXID": 1, - "RANK": 0, - "PERCENTAGE": 2, - "TAXPATH": 3, - "TAXPATHSN": 4 - } - indices = get_column_indices(column_name_to_index) - assert indices == (0, 1, 2, 3, 4) - -def test_get_cami_profile(): - file_path = os.path.join(os.path.dirname(__file__), 'testdata/sample_cami.txt') - with open(file_path, 'r') as file: - sample_cami_content = file.readlines() - - profiles = get_cami_profile(sample_cami_content) - - expected_header = { - 'SAMPLEID': 'CAMI_LOW_S001', - 'VERSION': '0.9.1', - 'RANKS': 'superkingdom|phylum|class|order|family|genus|species|strain', - 'TAXONOMYID': 'ncbi-taxonomy_DATE', - '__PROGRAM__': 'unknown' - } - - assert len(profiles) == 1 - sample_id, header, profile = profiles[0] - - assert sample_id == "CAMI_LOW_S001" - assert header == expected_header - assert len(profile) == 2044 - - prediction1 = profile[0] - assert prediction1.rank == 'superkingdom' - assert prediction1.taxid == '2157' - assert math.isclose(prediction1.percentage, 0.029528, abs_tol=1e-6) - assert prediction1.taxpath == '2157' - assert prediction1.taxpathsn == 'Archaea' - - prediction2 = profile[1] - assert prediction2.rank == 'superkingdom' - assert prediction2.taxid == '2' - assert math.isclose(prediction2.percentage, 29.183763, rel_tol=1e-6) - assert prediction2.taxpath == '2' - assert prediction2.taxpathsn == 'Bacteria' - + def test_get_alt_mut_rate(): nu = 10 thresh = 5 @@ -91,7 +44,7 @@ def test_get_alt_mut_rate(): result = get_alt_mut_rate(nu, thresh, ksize, significance) expected_result = 0.047902071844405425 assert math.isclose(result, expected_result, rel_tol=1e-6, abs_tol=1e-6) - + def test_get_alt_mut_rate_zero_nu(): nu = 0 thresh = 5 @@ -109,10 +62,10 @@ def test_get_alt_mut_rate_large_thresh(): result = get_alt_mut_rate(nu, thresh, ksize, significance) expected_result = -1 assert result == expected_result - + def test_get_info_from_single_sig(): sig_list_file = 'gtdb_ani_thresh_0.95_intermediate_files/training_sig_files.txt' - + with open(sig_list_file, 'r') as file: lines = file.readlines() if lines: @@ -147,7 +100,7 @@ def test_get_info_from_single_sig(): def test_collect_signature_info(): num_threads = 2 ksize = 0 - path_to_temp_dir = 'gtdb_ani_thresh_0.95_intermediate_files/' + path_to_temp_dir = 'gtdb_ani_thresh_0.95_intermediate_files/' result = collect_signature_info(num_threads, ksize, path_to_temp_dir) @@ -171,16 +124,16 @@ def test_run_multisearch(): result = run_multisearch(num_threads, ani_thresh, ksize, scale, path_to_temp_dir) for signature_name, expected_related_genomes in expected_results.items(): - assert signature_name in result - actual_related_genomes = result[signature_name] + assert signature_name in result + actual_related_genomes = result[signature_name] assert set(actual_related_genomes) == set(expected_related_genomes) - + def test_single_hyp_test(): exclusive_hashes_info_org = (100, 90) ksize = 31 - + result = single_hyp_test(exclusive_hashes_info_org, ksize) - + in_sample_est, p_val, num_exclusive_kmers, num_exclusive_kmers_coverage, num_matches, \ acceptance_threshold_with_coverage, actual_confidence_with_coverage, alt_confidence_mut_rate_with_coverage = result @@ -193,6 +146,6 @@ def test_single_hyp_test(): assert isinstance(actual_confidence_with_coverage, float) assert isinstance(alt_confidence_mut_rate_with_coverage, float) - + if __name__ == '__main__': - pytest.main() \ No newline at end of file + pytest.main()