From 33aea22f8aaef206b60534dfda9f78bf31414fb3 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Thu, 19 Jan 2023 12:23:25 +0100 Subject: [PATCH 01/34] [FIX] Speed comparison. --- README.md | 7 ++++--- src/snakemake/speed/Snakefile | 16 ++++++++-------- src/snakemake/speed/plot_speed.py | 32 +++++++++++++++---------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index a4865e5..c048636 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,8 @@ The first number is the minimum, then follows the mean, the variance and the max **Note:** Currently, there are two implementations of the strobemers supported. The original one from [Kristoffer Sahlin](https://github.com/ksahlin/strobemers) and the one here presented. The one here presented is more comparable to the other methods used here, because they are based on the same hash functions. Therefore, these strobemers are used for almost every evaluation metric. However, currently the implementation is slower than the one from Sahlin, that is why both implementations can be used with speed. -For the original implementation, add the flag `--original` and note that for the original implementation, only randstrobemers are supported for irder 2 and 3, minstrobemers and hybridstrobemers only support order 2. Furthermore, the flags `--w-min` and `--w-max` have different meanings between the original implementation and the implementation here. +For the original implementation, add the flag `--original` and note that for the original implementation, only randstrobemers are supported for order 2 and 3, minstrobemers and hybridstrobemers only support order 2. Furthermore, the flags `--w-min` and `--w-max` have different meanings between the original implementation and the implementation here. -`w-min` in the implementation from minions is the distance between the first strobe to second strobe. 
While for the original implementation, it is the starting position in the sequence of the window that is considered for the second strobe. -`w-max` in the implementation from minions is the window length that should be considered for every strobe besides the first one. All strobes need to be completely inside this window length to be considered. While for the original implementation, it is the position in the sequence until which a strobe that is considered has to start. Therefore, for a strobemer with a strobe length of 8, `w-min` of 0 and `w-max` of 15 in the minion implementation would equal a `w-min` of 9 and `w-max` of 17. For more details, please read the documentation for both implementations. +`w-min` in the implementation from minions is the distance between the first strobe to second strobe. While for the original implementation, it is the starting position in the sequence of the window that is considered for the second strobe. Therefore, the call with original should always add (k+1) to `w-min` compared to the minion implementation. + +`w-max` in the implementation from minions is the window length that should be considered for every strobe besides the first one. All strobes need to be completely inside this window length to be considered. While for the original implementation, it is the position in the sequence until which a strobe that is considered has to end. Therefore, for a strobemer with a strobe length of 8, `w-min` of 0 and `w-max` of 15 in the minion implementation would equal a `w-min` of 9 and `w-max` of 24. For more details, please read the documentation for both implementations. 
diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index ad1b128..11b332d 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -25,15 +25,15 @@ rule plot: [shape + "_kmer_hash_30_speed.out" for shape in ["0", "805287931", "1004529051"]], [shape + "_kmer_hash_32_speed.out" for shape in ["0", "3169577727", "241004285"]], # 4 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in [9,12,15]], + ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in [9,12,15]] + ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]] shell: "python3 plot_speed.py" rule download_example_Data: diff --git a/src/snakemake/speed/plot_speed.py 
b/src/snakemake/speed/plot_speed.py index 939d255..1fd6057 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -22,14 +22,14 @@ def read_file(results, files): gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) shapes8 = ["51755","975475","13954519","241004285","241004285"] gapped8_kmers = read_file([], [shapes8[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17,2)]) -hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17,2)]) -hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) +hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers8 
= read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17,2)]) +hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17,2)]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17,2)]) +randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]]) # Plot comparison between k-mers fig = plt.figure() @@ -137,14 +137,14 @@ def read_file(results, files): pos_order3 = [1.25,4.25,7.25] strobe_range = [k for k in range(8,17)] -minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in strobe_range]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" 
for k in strobe_range]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers8 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) +randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]]) fig = plt.figure() X = np.arange(len(k_size)) From b0499f6f4d22ecefd4e1436eee3b9276f5abe374 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 20 Jan 2023 13:22:49 +0100 Subject: [PATCH 02/34] [MISC] Rename library to original. --- src/main.cpp | 2 +- src/snakemake/speed/Snakefile | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7b42d7c..4127a76 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -223,7 +223,7 @@ int speed(seqan3::argument_parser & parser) std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); - parser.add_flag(args.lib_implementation, '\0', "library", "Set, if you want to use the strobemer implementation from Sahlin."); + parser.add_flag(args.lib_implementation, '\0', "original", "Set, if you want to use the strobemer implementation from Sahlin."); read_range_arguments_minimiser(parser, args); read_range_arguments_strobemers(parser, args); diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index 11b332d..d85c5ce 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -56,7 +56,7 @@ rule speed_minstrobemer: output: "minstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" shell: - 
"minions speed --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --library --order 2 {input}" + "minions speed --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input}" rule speed_hybridstrobemer: input: @@ -64,7 +64,7 @@ rule speed_hybridstrobemer: output: "hybridstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" shell: - "minions speed --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --library --order 2 {input}" + "minions speed --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input}" rule speed_randstrobemer: input: @@ -72,4 +72,4 @@ rule speed_randstrobemer: output: "randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" shell: - "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --library --order {wildcards.order} {input}" + "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order {wildcards.order} {input}" From e9e650ec0d3e3359219141c9e64bf7f6f66dabf5 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 20 Jan 2023 16:18:21 +0100 Subject: [PATCH 03/34] [MISC] Change input to unique. --- include/compare.h | 9 ++--- src/compare.cpp | 11 +++--- src/main.cpp | 23 ++++------- test/cli/minions_unique_test.cpp | 65 +------------------------------- 4 files changed, 19 insertions(+), 89 deletions(-) diff --git a/include/compare.h b/include/compare.h index baa388c..bde205f 100644 --- a/include/compare.h +++ b/include/compare.h @@ -141,9 +141,8 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen */ void do_speed(std::vector sequence_files, range_arguments & args); -/*! 
\brief Function that calculates the uniqueness of submers in given sequence files. - * \param sequence_files A vector of sequence files. - * \param method_name The name of the method. - * \param args The arguments about the view to be used. +/*! \brief Function that calculates the uniqueness of submers in given files. + * \param input_files A vector of input files. An input file is a count file obtained by counts. + * \param oname The name of the output file. */ -void unique(std::vector sequence_files, std::string method_name, range_arguments & args); +void unique(std::vector input_files, std::filesystem::path oname); diff --git a/src/compare.cpp b/src/compare.cpp index 7a2189d..d2b3dc8 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -536,16 +536,17 @@ uint64_t count_singletons(robin_hood::unordered_node_map & h return singletons; } -void unique(std::vector sequence_files, std::string method_name, range_arguments & args) +// Input files should be the output files from count +void unique(std::vector input_files, std::filesystem::path oname) { std::ifstream infile; std::ofstream outfile; - outfile.open(std::string{args.path_out} + method_name + "_unique.out"); + outfile.open(oname); - for (int i = 0; i < sequence_files.size(); ++i) + for (int i = 0; i < input_files.size(); ++i) { robin_hood::unordered_node_map hash_table{}; - infile.open(std::string{args.path_out} + method_name + "_"+ std::string{sequence_files[i].stem()} + "_counts.out", std::ios::binary); + infile.open(input_files[i], std::ios::binary); uint64_t submer; uint16_t submer_count; while(infile.read((char*)&submer, sizeof(submer))) @@ -555,7 +556,7 @@ void unique(std::vector sequence_files, std::string metho } infile.close(); - outfile << sequence_files[i].stem() << "\t" << (count_singletons(hash_table) * 100.0)/hash_table.size() << "\n"; + outfile << input_files[i].stem() << "\t" << (count_singletons(hash_table) * 100.0)/hash_table.size() << "\n"; } outfile.close(); diff --git a/src/main.cpp 
b/src/main.cpp index 4127a76..28cd96c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -247,24 +247,16 @@ int speed(seqan3::argument_parser & parser) int unique(seqan3::argument_parser & parser) { - range_arguments args{}; - std::vector sequence_files{}; - parser.info.short_description = "Calculates the percentage of unique submers of a method for the given sequence files."; - parser.add_positional_option(sequence_files, - "Please provide at least one sequence file."); - all_arguments(parser, args); - std::string method{}; - parser.add_option(method, '\0', "method", "Pick your method.", - seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); - parser.add_flag(args.lib_implementation, '\0', "library", "Set, if you want to use the strobemer implementation from Sahlin."); - - read_range_arguments_minimiser(parser, args); - read_range_arguments_strobemers(parser, args); + std::filesystem::path oname{}; + std::vector input_files{}; + parser.info.short_description = "Calculates the percentage of unique submers of a method for the given files."; + parser.add_positional_option(input_files, + "Please provide at least one input file. 
An input file is a count file obtained by minions count."); + parser.add_option(oname, 'o', "out", "Name of the output file."); try { parser.parse(); - parsing(args); } catch (seqan3::argument_parser_error const & ext) // catch user errors { @@ -272,8 +264,7 @@ int unique(seqan3::argument_parser & parser) return -1; } - string_to_methods(method, args.name); - unique(sequence_files, create_name(args), args); + unique(input_files, oname); return 0; } diff --git a/test/cli/minions_unique_test.cpp b/test/cli/minions_unique_test.cpp index f68c9f0..a33b9e3 100644 --- a/test/cli/minions_unique_test.cpp +++ b/test/cli/minions_unique_test.cpp @@ -5,72 +5,11 @@ TEST_F(cli_test, no_options) cli_test_result result = execute_app("minions unique"); std::string expected { - "minions-unique - Calculates the percentage of unique submers of a method for the given sequence files.\n" - "======================================================================================================\n" + "minions-unique - Calculates the percentage of unique submers of a method for the given files.\n" + "=============================================================================================\n" " Try -h or --help for more information.\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); } - -TEST_F(cli_test, with_argument) -{ - cli_test_result result = execute_app("minions unique --method kmer -k 19", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, minimiser) -{ - cli_test_result result = execute_app("minions unique --method minimiser -k 19 -w 19 ", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, modmer) -{ - cli_test_result result = execute_app("minions unique --method modmer -k 19 -w 2 ", 
data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, strobemer) -{ - cli_test_result result = execute_app("minions unique --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --rand", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, hybridstrobemer) -{ - cli_test_result result = execute_app("minions unique --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --hybrid", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, minstrobemer) -{ - cli_test_result result = execute_app("minions unique --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --min", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{}); -} - -TEST_F(cli_test, wrong_method) -{ - cli_test_result result = execute_app("minions unique --method submer -k 19", data("example1.fasta")); - std::string expected - { - "Error. Incorrect command line input for unique. Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer].\n" - }; - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, expected); -} From 2236f9b41238f64c9759605a06cebb1a33ea958a Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 20 Jan 2023 20:51:53 +0100 Subject: [PATCH 04/34] [MISC] Speed up randstrobemers and minstrobemers. 
--- include/minstrobe.hpp | 79 +++++++++++++++++++++++++++++++++++++----- include/randstrobe.hpp | 50 ++++++++++++++------------ 2 files changed, 97 insertions(+), 32 deletions(-) diff --git a/include/minstrobe.hpp b/include/minstrobe.hpp index efa5d72..4c0d33b 100644 --- a/include/minstrobe.hpp +++ b/include/minstrobe.hpp @@ -317,7 +317,6 @@ class minstrobe_view::basic_iterator "Please choose a bigger window size greater than 0."}; fill_window(); - determine_value(); } //!\} @@ -621,6 +620,12 @@ class minstrobe_view::basic_iterator //!\brief The number of elements in a window. size_t window_size{}; + //!\brief The offset of the minstrobe. + size_t minstrobe_position_offset{}; + + //!\brief The offset of the minstrobe for order 3. + size_t minstrobe_position_offset3{}; + //!\brief Advances the window of the iterators to the next position. void advance_windows() { @@ -681,15 +686,13 @@ class minstrobe_view::basic_iterator { window_values3.push_back(*third_iterator); } - } - //!\brief Determine minstrobe value based on the contents of the window and the first_iterator. - void determine_value() - { - auto minstrobe_it =std::ranges::min_element(window_values, std::less_equal{}); + auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); + minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it); if constexpr(order_3) { auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); + minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3); minstrobe_value = {*first_iterator, *minstrobe_it, *minstrobe_it3}; } else @@ -715,7 +718,6 @@ class minstrobe_view::basic_iterator window_values3.clear(); } fill_window(); - determine_value(); } /*!\brief Calculates the next minstrobe value. 
@@ -734,16 +736,45 @@ class minstrobe_view::basic_iterator return; } + minstrobe_value[0] = *first_iterator; window_values.pop_front(); window_values.push_back(*second_iterator); + if (minstrobe_position_offset == 0) + { + auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); + minstrobe_value[1] = *minstrobe_it; + minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it) + 1; + } + + if (*second_iterator <= minstrobe_value[1]) + { + minstrobe_value[1] = *second_iterator; + minstrobe_position_offset = window_values.size(); + } + if constexpr(order_3) { window_values3.pop_front(); window_values3.push_back(*third_iterator); + + if (minstrobe_position_offset3 == 0) + { + auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); + minstrobe_value[2] = *minstrobe_it3; + minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3) + 1; + } + + if (*third_iterator <= minstrobe_value[2]) + { + minstrobe_value[2] = *third_iterator; + minstrobe_position_offset3 = window_values3.size(); + } + + --minstrobe_position_offset3; } - determine_value(); + --minstrobe_position_offset; } /*!\brief Calculates the previous minstrobe value. 
@@ -763,16 +794,46 @@ class minstrobe_view::basic_iterator } retreat_windows(); + + minstrobe_value[0] = *first_iterator; window_values.pop_back(); window_values.push_front(*second_iterator_back); + if (minstrobe_position_offset == (window_values.size() - 1)) + { + auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); + minstrobe_value[1] = *minstrobe_it; + minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it); + } + + if (*second_iterator_back < minstrobe_value[1]) + { + minstrobe_value[1] = *second_iterator_back; + minstrobe_position_offset = 0; + } + if constexpr(order_3) { window_values3.pop_back(); window_values3.push_front(*third_iterator_back); + + if (minstrobe_position_offset3 == (window_values3.size() - 1)) + { + auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); + minstrobe_value[2] = *minstrobe_it3; + minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3); + } + + if (*third_iterator_back <= minstrobe_value[2]) + { + minstrobe_value[2] = *third_iterator_back; + minstrobe_position_offset3 = 0; + } + + ++minstrobe_position_offset3; } - determine_value(); + ++minstrobe_position_offset; } }; diff --git a/include/randstrobe.hpp b/include/randstrobe.hpp index c7a4235..84106d9 100644 --- a/include/randstrobe.hpp +++ b/include/randstrobe.hpp @@ -632,21 +632,23 @@ class randstrobe_view::basic_iterator //!\brief Fills window and determines randstrobe value. void fill_window() { - //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current randstrobe. - std::deque window_values{}; + //!\brief Stores minimum for order 2. + value_t minimum{}; - //!\brief Stored values per window for order 3. - std::deque window_values3{}; + //!\brief Stores minimum for order 3. + value_t minimum3{}; - //!\brief Stored hash values per window. It is necessary to store them, because a shift can remove the current randstrobe. 
- std::deque hash_values{}; + //!\brief Stores minimum hash value for order 2. + value_t minimum_hash{}; - //!\brief Stored hash values per window for order 3. - std::deque hash_values3{}; + //!\brief Stores minimum hash value for order 3. + value_t minimum_hash3{}; second_iterator = first_iterator; std::ranges::advance(second_iterator, window_dist); second_iterator_back = second_iterator; + minimum = *second_iterator; + minimum_hash = linking(*first_iterator, *second_iterator); if constexpr(order_3) { @@ -658,34 +660,36 @@ class randstrobe_view::basic_iterator for (int i = 1u; i < window_size; ++i) { - hash_values.push_back(linking(*first_iterator, *second_iterator)); - window_values.push_back(*second_iterator); ++second_iterator; + value_t new_value = linking(*first_iterator, *second_iterator); + if (new_value <= minimum_hash) + { + minimum_hash = new_value; + minimum = *second_iterator; + } } - hash_values.push_back(linking(*first_iterator, *second_iterator)); - window_values.push_back(*second_iterator); - - auto randstrobe_it = std::ranges::min_element(hash_values, std::less_equal{}); - auto index = std::distance(std::begin(hash_values), randstrobe_it); if constexpr(order_3) { + minimum3 = *third_iterator; + minimum_hash3 = linking(*first_iterator, minimum, *third_iterator); + for (int i = 1u; i < window_size; ++i) { - hash_values3.push_back(linking(*first_iterator, window_values[index], *third_iterator)); - window_values3.push_back(*third_iterator); ++third_iterator; + value_t new_value = linking(*first_iterator, minimum, *third_iterator); + if (new_value <= minimum_hash3) + { + minimum_hash3 = new_value; + minimum3 = *third_iterator; + } } - hash_values3.push_back(linking(*first_iterator, window_values[index], *third_iterator)); - window_values3.push_back(*third_iterator); - auto randstrobe_it3 = std::ranges::min_element(hash_values3, std::less_equal{}); - auto index3 = std::distance(std::begin(hash_values3), randstrobe_it3); - randstrobe_value = 
{*first_iterator, window_values[index], window_values3[index3]}; + randstrobe_value = {*first_iterator, minimum, minimum3}; } else { - randstrobe_value = {*first_iterator, window_values[index]}; + randstrobe_value = {*first_iterator, minimum}; } } From 027e1fdd9f1d8c3cc871c284ba60298aa936818e Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 20 Jan 2023 21:47:40 +0100 Subject: [PATCH 05/34] [MISC] Remove bidirectionality and random access from strobes. --- include/hybridstrobe.hpp | 252 +------------------------------ include/minstrobe.hpp | 267 +-------------------------------- include/randstrobe.hpp | 224 +-------------------------- test/api/hybridstrobe_test.cpp | 34 +---- test/api/minstrobe_test.cpp | 23 +-- test/api/randstrobe_test.cpp | 23 +-- 6 files changed, 27 insertions(+), 796 deletions(-) diff --git a/include/hybridstrobe.hpp b/include/hybridstrobe.hpp index 6762865..0a5f886 100644 --- a/include/hybridstrobe.hpp +++ b/include/hybridstrobe.hpp @@ -159,54 +159,10 @@ class hybridstrobe_view : public std::ranges::view_interface) - //!\endcond { return {}; } - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() noexcept - //!\cond - requires std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. - return begin()+size(); - - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() const noexcept - //!\cond - requires const_iterable_range && std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. 
- return begin()+size(); - } - /*!\brief Returns the size of the range, if the underlying range is a std::ranges::sized_range. * \returns Size of range. */ @@ -251,7 +207,7 @@ class hybridstrobe_view::basic_iterator //!\brief Reference to `value_type`. using reference = value_type; //!\brief Tag this class as a bidirectional iterator. - using iterator_category = std::random_access_iterator_tag; + using iterator_category = std::forward_iterator_tag; //!\brief Tag this class as a bidirectional iterator. using iterator_concept = iterator_category; //!\} @@ -275,8 +231,6 @@ class hybridstrobe_view::basic_iterator first_iterator{std::move(it.first_iterator)}, second_iterator{std::move(it.second_iterator)}, third_iterator{std::move(it.third_iterator)}, - second_iterator_back{std::move(it.second_iterator_back)}, - third_iterator_back{std::move(it.third_iterator_back)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, window_size{std::move(it.window_size)} @@ -299,10 +253,7 @@ class hybridstrobe_view::basic_iterator first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, - second_iterator_back{first_iterator}, - third_iterator_back{first_iterator}, urng_sentinel{std::move(urng_sentinel)}, - urng_first{first_iterator}, window_dist{window_dist}, window_size{window_size} { @@ -438,113 +389,6 @@ class hybridstrobe_view::basic_iterator return tmp; } - /*!\brief Pre-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator & operator--() noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - prev_hybridstrobe(); - return *this; - } - - /*!\brief Post-decrement. - * \attention This function is only available if underlying range is bidirectional. 
- */ - basic_iterator operator--(int) noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - basic_iterator tmp{*this}; - prev_hybridstrobe(); - return tmp; - } - /*!\brief Forward this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator+=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(skip); - return *this; - } - - /*!\brief Forward copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator operator+(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp += skip; - } - - /*!\brief Non-member operator+ delegates to non-friend operator+. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator+(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it + skip; - } - - /*!\brief Decrement iterator by `skip`. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator-=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(-skip); - return *this; - } - - /*!\anchor basic_iterator_operator-decrement - * \brief Return decremented copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. 
- */ - basic_iterator operator-(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp -= skip; - } - - /*!\brief Non-member operator- delegates to non-friend operator-. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator-(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it - skip; - } - - /*!\anchor basic_iterator_operator-difference - * \brief Return offset between two iterator's positions. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend difference_type operator-(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return static_cast(lhs.first_iterator - rhs.first_iterator); - } - /*!\brief Return offset between remote sentinel's position and this. * \attention This function is only available if sentinel_t and urng_t model std::sized_sentinel_for. */ @@ -573,17 +417,6 @@ class hybridstrobe_view::basic_iterator return static_cast(lhs.second_iterator - rhs); } - /*!\brief Move the iterator by a given offset and return the corresponding hash value. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - reference operator[](difference_type const n) const - //!\cond - requires std::random_access_iterator - //!\endcond - { - return *(*this + n); - } - //!\brief Return the hybridstrobe. value_type operator*() const noexcept { @@ -603,15 +436,6 @@ class hybridstrobe_view::basic_iterator //!\brief Iterator to the third strobe of hybridstrobe, if order is 3. urng_iterator_t third_iterator{}; - //!\brief Iterator to the left most value of the window and hence the second strobe of minstrobe for bidirectionality. 
- urng_iterator_t second_iterator_back{}; - - //!\brief Iterator to the left most value of the second window and hence the third strobe of minstrobe for bidirectionality, if order is 3. - urng_iterator_t third_iterator_back{}; - - //!\brief Iterator to first element in range. Needed for bidirectionality. - urng_iterator_t urng_first{}; - //!\brief Iterator to last element in range. urng_sentinel_t urng_sentinel{}; @@ -638,26 +462,10 @@ class hybridstrobe_view::basic_iterator { ++first_iterator; ++second_iterator; - ++second_iterator_back; if constexpr(order_3) { ++third_iterator; - ++third_iterator_back; - } - } - - //!\brief Retreat the window of the iterators to the next position. - void retreat_windows() - { - --first_iterator; - --second_iterator; - --second_iterator_back; - - if constexpr(order_3) - { - --third_iterator; - --third_iterator_back; } } @@ -666,14 +474,11 @@ class hybridstrobe_view::basic_iterator { second_iterator = first_iterator; std::ranges::advance(second_iterator, window_dist); - second_iterator_back = second_iterator; if constexpr(order_3) { third_iterator = second_iterator; - third_iterator_back = third_iterator; std::ranges::advance(third_iterator, window_size + window_dist - 1); - third_iterator_back = third_iterator; } for (int i = 1u; i < window_size; ++i) @@ -711,26 +516,6 @@ class hybridstrobe_view::basic_iterator } } - /*!\brief Increments or decrements iterator by `skip`. - * \param skip Amount to increment. - * \attention This function is only available if `urng_iterator_t` models std::random_access_iterator. - */ - void move_forward_backward(difference_type const skip) - //!\cond - requires std::random_access_iterator - //!\endcond - { - std::ranges::advance(first_iterator, skip); - window_values.clear(); - - if constexpr(order_3) - { - window_values3.clear(); - } - fill_window(); - determine_value(); - } - /*!\brief Calculates the next hybridstrobe value. 
* \details * For the following windows, we remove the first window value (is now not in window_values) and add the new @@ -739,11 +524,11 @@ class hybridstrobe_view::basic_iterator void next_hybridstrobe() { advance_windows(); - if (second_iterator_back == urng_sentinel) + if (second_iterator == urng_sentinel) return; if constexpr(order_3) { - if (third_iterator_back == urng_sentinel) + if (third_iterator == urng_sentinel) return; } @@ -758,35 +543,6 @@ class hybridstrobe_view::basic_iterator determine_value(); } - - /*!\brief Calculates the previous hybridstrobe value. - * \details - * For the following windows, we remove the last window value (is now not in window_values) and add the new - * value that results from the window shifting. - */ - void prev_hybridstrobe() - requires std::ranges::bidirectional_range - { - if (second_iterator_back == urng_first) - return; - if constexpr(order_3) - { - if (third_iterator_back == urng_first) - return; - } - - retreat_windows(); - window_values.pop_back(); - window_values.push_front(*second_iterator_back); - - if constexpr(order_3) - { - window_values3.pop_back(); - window_values3.push_front(*third_iterator_back); - } - - determine_value(); - } }; //!\brief A deduction guide for the view class template. @@ -872,7 +628,7 @@ namespace seqan3::views * | std::ranges::input_range | *required* | *preserved* | * | std::ranges::forward_range | *required* | *preserved* | * | std::ranges::bidirectional_range | | *preserved* | - * | std::ranges::random_access_range | | *preserved* | + * | std::ranges::random_access_range | | *lost* | * | std::ranges::contiguous_range | | *lost* | * | | | | * | std::ranges::viewable_range | *required* | *guaranteed* | diff --git a/include/minstrobe.hpp b/include/minstrobe.hpp index 4c0d33b..a72dcf5 100644 --- a/include/minstrobe.hpp +++ b/include/minstrobe.hpp @@ -159,54 +159,10 @@ class minstrobe_view : public std::ranges::view_interface * No-throw guarantee. 
*/ sentinel end() const - //!\cond - requires (!std::ranges::random_access_range) - //!\endcond { return {}; } - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() noexcept - //!\cond - requires std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. - return begin()+size(); - - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() const noexcept - //!\cond - requires const_iterable_range && std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. - return begin()+size(); - } - /*!\brief Returns the size of the range, if the underlying range is a std::ranges::sized_range. * \returns Size of range. */ @@ -251,7 +207,7 @@ class minstrobe_view::basic_iterator //!\brief Reference to `value_type`. using reference = value_type; //!\brief Tag this class as a bidirectional iterator. - using iterator_category = std::random_access_iterator_tag; + using iterator_category = std::forward_iterator_tag; //!\brief Tag this class as a bidirectional iterator. 
using iterator_concept = iterator_category; //!\} @@ -275,8 +231,6 @@ class minstrobe_view::basic_iterator first_iterator{std::move(it.first_iterator)}, second_iterator{std::move(it.second_iterator)}, third_iterator{std::move(it.third_iterator)}, - second_iterator_back{std::move(it.second_iterator_back)}, - third_iterator_back{std::move(it.third_iterator_back)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, window_size{std::move(it.window_size)} @@ -299,10 +253,7 @@ class minstrobe_view::basic_iterator first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, - second_iterator_back{first_iterator}, - third_iterator_back{first_iterator}, urng_sentinel{std::move(urng_sentinel)}, - urng_first{first_iterator}, window_dist{window_dist}, window_size{window_size} { @@ -433,111 +384,6 @@ class minstrobe_view::basic_iterator return tmp; } - /*!\brief Pre-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator & operator--() noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - prev_minstrobe(); - return *this; - } - - /*!\brief Post-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator operator--(int) noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - basic_iterator tmp{*this}; - prev_minstrobe(); - return tmp; - } - /*!\brief Forward this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator+=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(skip); - return *this; - } - - /*!\brief Forward copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. 
- */ - basic_iterator operator+(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp += skip; - } - - /*!\brief Non-member operator+ delegates to non-friend operator+. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator+(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it + skip; - } - - /*!\brief Decrement iterator by `skip`. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator-=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(-skip); - return *this; - } - - /*!\brief Return decremented copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator operator-(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp -= skip; - } - - /*!\brief Non-member operator- delegates to non-friend operator-. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator-(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it - skip; - } - - /*!\brief Return offset between two iterator's positions. - * \attention This function is only available if `urng_t` models std::random_access_iterator. 
- */ - friend difference_type operator-(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return static_cast(lhs.first_iterator - rhs.first_iterator); - } - /*!\brief Return offset between remote sentinel's position and this. * \attention This function is only available if sentinel_t and urng_t model std::sized_sentinel_for. */ @@ -566,17 +412,6 @@ class minstrobe_view::basic_iterator return static_cast(lhs.second_iterator - rhs); } - /*!\brief Move the iterator by a given offset and return the corresponding hash value. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - reference operator[](difference_type const n) const - //!\cond - requires std::random_access_iterator - //!\endcond - { - return *(*this + n); - } - //!\brief Return the minstrobe. value_type operator*() const noexcept { @@ -596,15 +431,6 @@ class minstrobe_view::basic_iterator //!\brief Iterator to the third strobe of minstrobe, if order is 3. urng_iterator_t third_iterator{}; - //!\brief Iterator to the left most value of the window and hence the second strobe of minstrobe for bidirectionality. - urng_iterator_t second_iterator_back{}; - - //!\brief Iterator to the left most value of the second window and hence the third strobe of minstrobe for bidirectionality, if order is 3. - urng_iterator_t third_iterator_back{}; - - //!\brief Iterator to first element in range. Needed for bidirectionality. - urng_iterator_t urng_first{}; - //!\brief Iterator to last element in range. 
urng_sentinel_t urng_sentinel{}; @@ -631,12 +457,10 @@ class minstrobe_view::basic_iterator { ++first_iterator; ++second_iterator; - ++second_iterator_back; if constexpr(order_3) { ++third_iterator; - ++third_iterator_back; } } @@ -645,12 +469,10 @@ class minstrobe_view::basic_iterator { --first_iterator; --second_iterator; - --second_iterator_back; if constexpr(order_3) { --third_iterator; - --third_iterator_back; } } @@ -659,14 +481,11 @@ class minstrobe_view::basic_iterator { second_iterator = first_iterator; std::ranges::advance(second_iterator, window_dist); - second_iterator_back = second_iterator; if constexpr(order_3) { third_iterator = second_iterator; - third_iterator_back = third_iterator; std::ranges::advance(third_iterator, window_size + window_dist - 1); - third_iterator_back = third_iterator; } for (int i = 1u; i < window_size; ++i) @@ -701,25 +520,6 @@ class minstrobe_view::basic_iterator } } - /*!\brief Increments or decrements iterator by `skip`. - * \param skip Amount to increment. - * \attention This function is only available if `urng_iterator_t` models std::random_access_iterator. - */ - void move_forward_backward(difference_type const skip) - //!\cond - requires std::random_access_iterator - //!\endcond - { - std::ranges::advance(first_iterator, skip); - window_values.clear(); - - if constexpr(order_3) - { - window_values3.clear(); - } - fill_window(); - } - /*!\brief Calculates the next minstrobe value. 
* \details * For the following windows, we remove the first window value (is now not in window_values) and add the new @@ -728,11 +528,11 @@ class minstrobe_view::basic_iterator void next_minstrobe() { advance_windows(); - if (second_iterator_back == urng_sentinel) + if (second_iterator == urng_sentinel) return; if constexpr(order_3) { - if (third_iterator_back == urng_sentinel) + if (third_iterator == urng_sentinel) return; } @@ -776,65 +576,6 @@ class minstrobe_view::basic_iterator --minstrobe_position_offset; } - - /*!\brief Calculates the previous minstrobe value. - * \details - * For the following windows, we remove the last window value (is now not in window_values) and add the new - * value that results from the window shifting. - */ - void prev_minstrobe() - requires std::ranges::bidirectional_range - { - if (second_iterator_back == urng_first) - return; - if constexpr(order_3) - { - if (third_iterator_back == urng_first) - return; - } - - retreat_windows(); - - minstrobe_value[0] = *first_iterator; - window_values.pop_back(); - window_values.push_front(*second_iterator_back); - - if (minstrobe_position_offset == (window_values.size() - 1)) - { - auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); - minstrobe_value[1] = *minstrobe_it; - minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it); - } - - if (*second_iterator_back < minstrobe_value[1]) - { - minstrobe_value[1] = *second_iterator_back; - minstrobe_position_offset = 0; - } - - if constexpr(order_3) - { - window_values3.pop_back(); - window_values3.push_front(*third_iterator_back); - - if (minstrobe_position_offset3 == (window_values3.size() - 1)) - { - auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); - minstrobe_value[2] = *minstrobe_it3; - minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3); - } - - if (*third_iterator_back <= minstrobe_value[2]) - { - minstrobe_value[2] = 
*third_iterator_back; - minstrobe_position_offset3 = 0; - } - - ++minstrobe_position_offset3; - } - - ++minstrobe_position_offset; - } }; //!\brief A deduction guide for the view class template. @@ -920,7 +661,7 @@ namespace seqan3::views * | std::ranges::input_range | *required* | *preserved* | * | std::ranges::forward_range | *required* | *preserved* | * | std::ranges::bidirectional_range | | *preserved* | - * | std::ranges::random_access_range | | *preserved* | + * | std::ranges::random_access_range | | *lost* | * | std::ranges::contiguous_range | | *lost* | * | | | | * | std::ranges::viewable_range | *required* | *guaranteed* | diff --git a/include/randstrobe.hpp b/include/randstrobe.hpp index 84106d9..d46f743 100644 --- a/include/randstrobe.hpp +++ b/include/randstrobe.hpp @@ -159,54 +159,10 @@ class randstrobe_view : public std::ranges::view_interface) - //!\endcond { return {}; } - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() noexcept - //!\cond - requires std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. - return begin()+size(); - - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - auto end() const noexcept - //!\cond - requires const_iterable_range && std::ranges::random_access_range - //!\endcond - { - // If the underlying range supports random access, then we can just jump to the end. - return begin()+size(); - } - /*!\brief Returns the size of the range, if the underlying range is a std::ranges::sized_range. * \returns Size of range. 
*/ @@ -250,9 +206,9 @@ class randstrobe_view::basic_iterator using pointer = void; //!\brief Reference to `value_type`. using reference = value_type; - //!\brief Tag this class as a bidirectional iterator. - using iterator_category = std::random_access_iterator_tag; - //!\brief Tag this class as a bidirectional iterator. + //!\brief Tag this class as a forward iterator. + using iterator_category = std::forward_iterator_tag; + //!\brief Tag this class as a forward iterator. using iterator_concept = iterator_category; //!\} @@ -275,8 +231,6 @@ class randstrobe_view::basic_iterator first_iterator{std::move(it.first_iterator)}, second_iterator{std::move(it.second_iterator)}, third_iterator{std::move(it.third_iterator)}, - second_iterator_back{std::move(it.second_iterator_back)}, - third_iterator_back{std::move(it.third_iterator_back)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, window_size{std::move(it.window_size)} @@ -299,10 +253,7 @@ class randstrobe_view::basic_iterator first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, - second_iterator_back{first_iterator}, - third_iterator_back{first_iterator}, urng_sentinel{std::move(urng_sentinel)}, - urng_first{first_iterator}, window_dist{window_dist}, window_size{window_size} { @@ -433,111 +384,6 @@ class randstrobe_view::basic_iterator return tmp; } - /*!\brief Pre-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator & operator--() noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - prev_randstrobe(); - return *this; - } - - /*!\brief Post-decrement. - * \attention This function is only available if underlying range is bidirectional. 
- */ - basic_iterator operator--(int) noexcept - //!\cond - requires std::ranges::bidirectional_range - //!\endcond - { - basic_iterator tmp{*this}; - prev_randstrobe(); - return tmp; - } - /*!\brief Forward this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator+=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(skip); - return *this; - } - - /*!\brief Forward copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator operator+(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp += skip; - } - - /*!\brief Non-member operator+ delegates to non-friend operator+. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator+(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it + skip; - } - - /*!\brief Decrement iterator by `skip`. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator & operator-=(difference_type const skip) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - move_forward_backward(-skip); - return *this; - } - - /*!\brief Return decremented copy of this iterator. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - basic_iterator operator-(difference_type const skip) const noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - basic_iterator tmp{*this}; - return tmp -= skip; - } - - /*!\brief Non-member operator- delegates to non-friend operator-. 
- * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend basic_iterator operator-(difference_type const skip, basic_iterator const & it) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return it - skip; - } - - /*!\brief Return offset between two iterator's positions. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - friend difference_type operator-(basic_iterator const & lhs, basic_iterator const & rhs) noexcept - //!\cond - requires std::random_access_iterator - //!\endcond - { - return static_cast(lhs.first_iterator - rhs.first_iterator); - } - /*!\brief Return offset between remote sentinel's position and this. * \attention This function is only available if sentinel_t and urng_t model std::sized_sentinel_for. */ @@ -566,17 +412,6 @@ class randstrobe_view::basic_iterator return static_cast(lhs.second_iterator - rhs); } - /*!\brief Move the iterator by a given offset and return the corresponding hash value. - * \attention This function is only available if `urng_t` models std::random_access_iterator. - */ - reference operator[](difference_type const n) const - //!\cond - requires std::random_access_iterator - //!\endcond - { - return *(*this + n); - } - //!\brief Return the randstrobe. value_type operator*() const noexcept { @@ -596,15 +431,6 @@ class randstrobe_view::basic_iterator //!\brief Iterator to the third strobe of randstrobe, if order is 3. urng_iterator_t third_iterator{}; - //!\brief Iterator to the left most value of the window and hence the second strobe of randstrobe for bidirectionality. - urng_iterator_t second_iterator_back{}; - - //!\brief Iterator to the left most value of the second window and hence the third strobe of randstrobe for bidirectionality, if order is 3. - urng_iterator_t third_iterator_back{}; - - //!\brief Iterator to first element in range. Needed for bidirectionality. 
- urng_iterator_t urng_first{}; - //!\brief Iterator to last element in range. urng_sentinel_t urng_sentinel{}; @@ -646,16 +472,13 @@ class randstrobe_view::basic_iterator second_iterator = first_iterator; std::ranges::advance(second_iterator, window_dist); - second_iterator_back = second_iterator; minimum = *second_iterator; minimum_hash = linking(*first_iterator, *second_iterator); if constexpr(order_3) { third_iterator = second_iterator; - third_iterator_back = third_iterator; std::ranges::advance(third_iterator, window_size + window_dist - 1); - third_iterator_back = third_iterator; } for (int i = 1u; i < window_size; ++i) @@ -693,19 +516,6 @@ class randstrobe_view::basic_iterator } } - /*!\brief Increments or decrements iterator by `skip`. - * \param skip Amount to increment. - * \attention This function is only available if `urng_iterator_t` models std::random_access_iterator. - */ - void move_forward_backward(difference_type const skip) - //!\cond - requires std::random_access_iterator - //!\endcond - { - std::ranges::advance(first_iterator, skip); - fill_window(); - } - /*!\brief Calculates the next randstrobe value. * \details * For the following windows, we remove the first window value (is now not in window_values) and add the new @@ -714,34 +524,14 @@ class randstrobe_view::basic_iterator void next_randstrobe() { ++first_iterator; - if (second_iterator_back == urng_sentinel) - return; - if constexpr(order_3) - { - if (third_iterator_back == urng_sentinel) - return; - } - - fill_window(); - } - - /*!\brief Calculates the previous randstrobe value. - * \details - * For the following windows, we remove the last window value (is now not in window_values) and add the new - * value that results from the window shifting. 
- */ - void prev_randstrobe() - requires std::ranges::bidirectional_range - { - if (second_iterator_back == urng_first) + if (second_iterator == urng_sentinel) return; if constexpr(order_3) { - if (third_iterator_back == urng_first) + if (third_iterator == urng_sentinel) return; } - --first_iterator; fill_window(); } }; @@ -828,8 +618,8 @@ namespace seqan3::views * |----------------------------------|:----------------------------------:|:--------------------------------:| * | std::ranges::input_range | *required* | *preserved* | * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *preserved* | - * | std::ranges::random_access_range | | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | * | std::ranges::contiguous_range | | *lost* | * | | | | * | std::ranges::viewable_range | *required* | *guaranteed* | diff --git a/test/api/hybridstrobe_test.cpp b/test/api/hybridstrobe_test.cpp index 888c27b..29b3ea1 100644 --- a/test/api/hybridstrobe_test.cpp +++ b/test/api/hybridstrobe_test.cpp @@ -37,7 +37,7 @@ using order3_iterator_type = std::ranges::iterator_t struct iterator_fixture : public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -51,7 +51,7 @@ struct iterator_fixture : public ::testing::Test template <> struct iterator_fixture : public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -100,13 +100,6 @@ class hybridstrobe_test : public ::testing::Test result_t result3_ungapped_start{{152,191},{97,111},{134,242}}; result_t result3_gapped_start{{8,11},{5,12},{10,11}}; - // Reverse complement: ctaaacgtcgccgt - // 
kmers: ctaa, taaa, aaac, aacg, acgt, cgtc, gtcg, tcgc, cgcc, gccg, ccgt - // ungapped Hashes: 112, 192, 1, 6, 27, 109, 182, 217, 101, 150, 91 - // gapped Hashes: 4, 12, 1, 2, 3, 5, 10, 13, 5, 10, 7 - result_t result3_rev_comp_ungapped{{112,6},{192,1},{1,109},{6,27},{27,109},{109,101}}; - result_t result3_rev_comp_gapped{{4,2},{12,1},{1,5},{2,5},{3,5},{5,7}}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped // ACGGCGACGTTTAG @@ -119,8 +112,6 @@ class hybridstrobe_test : public ::testing::Test // start at A gapped hybridstrobes: G--GC--CA--T, G--AA--TC--T, C--CA--TG--T result_t order_3_ungapped{{26,152,27},{105,166,134},{166,97,111},{152,27,252},{97,27,252}}; result_t order_3_gapped{{2,8,3},{5,5,7},{10,5,7},{8,3,12},{5,7,14}}; - result_t order_3_rev_comp_ungapped{{112,1,109},{192,1,109},{1,27,217},{6,27,217},{27,109,101}}; - result_t order_3_rev_comp_gapped{{4,1,5},{12,1,5},{1,3,13},{2,10,10},{3,5,5}}; }; template @@ -128,7 +119,8 @@ void compare_types(adaptor_t v) { EXPECT_TRUE(std::ranges::input_range); EXPECT_TRUE(std::ranges::forward_range); - EXPECT_TRUE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); EXPECT_TRUE(std::ranges::view); EXPECT_TRUE(std::ranges::sized_range); EXPECT_TRUE(seqan3::const_iterable_range); @@ -142,13 +134,9 @@ TYPED_TEST(hybridstrobe_view_properties_test, concepts) auto v = text | kmer_view | hybridstrobe_view; compare_types(v); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); auto v2 = seqan3::detail::hybridstrobe_view(text | kmer_view,1,3); compare_types(v2); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); } TYPED_TEST(hybridstrobe_view_properties_test, different_inputs_kmer_hash) @@ -159,47 
+147,33 @@ TYPED_TEST(hybridstrobe_view_properties_test, different_inputs_kmer_hash) result_t gapped{{3,5},{5,3},{10,3},{12,5},{5,12},{10,11}}; EXPECT_RANGE_EQ(ungapped, text | kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(std::views::reverse(ungapped), std::views::reverse(text | kmer_view | hybridstrobe_view)); - EXPECT_RANGE_EQ(std::views::reverse(gapped), std::views::reverse(text | gapped_kmer_view | hybridstrobe_view)); result_t ungapped3{{27,109,97},{109,216,27},{182,134,191},{216,97,111},{97,27,252}}; result_t gapped3{{3,5,5},{5,5,7},{10,5,7},{12,5,7},{5,7,14}}; EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::hybridstrobe_view(text | kmer_view,1,3))); EXPECT_RANGE_EQ(gapped3, (seqan3::detail::hybridstrobe_view(text | gapped_kmer_view,1,3))); - EXPECT_RANGE_EQ(std::views::reverse(ungapped3), std::views::reverse(seqan3::detail::hybridstrobe_view(text | kmer_view,1,3))); - EXPECT_RANGE_EQ(std::views::reverse(gapped3), std::views::reverse(seqan3::detail::hybridstrobe_view(text | gapped_kmer_view,1,3))); } TEST_F(hybridstrobe_test, ungapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_ungapped, text3 | seqan3::views::complement | std::views::reverse | kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | kmer_view,1,3))); EXPECT_RANGE_EQ(order_3_ungapped, (seqan3::detail::hybridstrobe_view(text3 | kmer_view,1,3))); - EXPECT_RANGE_EQ(order_3_rev_comp_ungapped, (seqan3::detail::hybridstrobe_view(text3 | seqan3::views::complement | std::views::reverse | kmer_view,1,3))); - } TEST_F(hybridstrobe_test, gapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_gapped, 
text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | gapped_kmer_view,1,3))); EXPECT_RANGE_EQ(order_3_gapped, (seqan3::detail::hybridstrobe_view(text3 | gapped_kmer_view,1,3))); - EXPECT_RANGE_EQ(order_3_rev_comp_gapped, (seqan3::detail::hybridstrobe_view(text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view,1,3))); } TEST_F(hybridstrobe_test, combinability) { - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_ungapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | kmer_view | hybridstrobe_view)); - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_gapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | hybridstrobe_view)); - EXPECT_RANGE_EQ(std::views::reverse(order_3_rev_comp_ungapped), std::views::reverse((seqan3::detail::hybridstrobe_view(text3 | seqan3::views::complement | std::views::reverse | kmer_view,1,3)))); - EXPECT_RANGE_EQ(std::views::reverse(order_3_rev_comp_gapped), std::views::reverse((seqan3::detail::hybridstrobe_view(text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view,1,3)))); - auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | hybridstrobe_view); diff --git a/test/api/minstrobe_test.cpp b/test/api/minstrobe_test.cpp index 86b9873..088bd0c 100644 --- a/test/api/minstrobe_test.cpp +++ b/test/api/minstrobe_test.cpp @@ -37,7 +37,7 @@ using order3_iterator_type = std::ranges::iterator_t struct iterator_fixture : public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -51,7 +51,7 @@ 
struct iterator_fixture : public ::testing::Test template <> struct iterator_fixture : public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -100,13 +100,6 @@ class minstrobe_test : public ::testing::Test result_t result3_ungapped_start{{152,27},{97,27},{134,111}}; result_t result3_gapped_start{{8,3},{5,3},{10,7}}; - // Reverse complement: ctaaacgtcgccgt - // kmers: ctaa, taaa, aaac, aacg, acgt, cgtc, gtcg, tcgc, cgcc, gccg, ccgt - // ungapped Hashes: 112, 192, 1, 6, 27, 109, 182, 217, 101, 150, 91 - // gapped Hashes: 4, 12, 1, 2, 3, 5, 10, 13, 5, 10, 7 - result_t result3_rev_comp_ungapped{{112,1},{192,6},{1,27},{6,101},{27,101},{109,91}}; - result_t result3_rev_comp_gapped{{4,1},{12,2},{1,3},{2,5},{3,5},{5,5}}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped // ACGGCGACGTTTAG @@ -128,7 +121,8 @@ void compare_types(adaptor_t v) { EXPECT_TRUE(std::ranges::input_range); EXPECT_TRUE(std::ranges::forward_range); - EXPECT_TRUE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); EXPECT_TRUE(std::ranges::view); EXPECT_TRUE(std::ranges::sized_range); EXPECT_TRUE(seqan3::const_iterable_range); @@ -142,13 +136,9 @@ TYPED_TEST(minstrobe_view_properties_test, concepts) auto v = text | kmer_view | minstrobe_view; compare_types(v); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); auto v2 = seqan3::detail::minstrobe_view(text | kmer_view,1,3); compare_types(v2); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); } TYPED_TEST(minstrobe_view_properties_test, different_inputs_kmer_hash) 
@@ -170,7 +160,6 @@ TEST_F(minstrobe_test, ungapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_ungapped, text3 | seqan3::views::complement | std::views::reverse | kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | kmer_view,1,3))); EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::minstrobe_view(text3 | kmer_view,1,3))); @@ -180,7 +169,6 @@ TEST_F(minstrobe_test, gapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_gapped, text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | gapped_kmer_view,1,3))); EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::minstrobe_view(text3 | gapped_kmer_view,1,3))); @@ -188,9 +176,6 @@ TEST_F(minstrobe_test, gapped_kmer_hash) TEST_F(minstrobe_test, combinability) { - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_ungapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | kmer_view | minstrobe_view)); - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_gapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | minstrobe_view)); - auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | minstrobe_view); diff --git a/test/api/randstrobe_test.cpp b/test/api/randstrobe_test.cpp index 9539862..6175a4f 100644 --- a/test/api/randstrobe_test.cpp +++ b/test/api/randstrobe_test.cpp @@ -37,7 +37,7 @@ using order3_iterator_type = std::ranges::iterator_t struct iterator_fixture : 
public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -51,7 +51,7 @@ struct iterator_fixture : public ::testing::Test template <> struct iterator_fixture : public ::testing::Test { - using iterator_tag = std::random_access_iterator_tag; + using iterator_tag = std::forward_iterator_tag; static constexpr bool const_iterable = true; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; @@ -94,13 +94,6 @@ class randstrobe_test : public ::testing::Test result_t result3_ungapped_start{{152,191},{97,252},{134,191}}; result_t result3_gapped_start{{8,11},{5,12},{10,7}}; - // Reverse complement: ctaaacgtcgccgt - // kmers: ctaa, taaa, aaac, aacg, acgt, cgtc, gtcg, tcgc, cgcc, gccg, ccgt - // ungapped Hashes: 112, 192, 1, 6, 27, 109, 182, 217, 101, 150, 91 - // gapped Hashes: 4, 12, 1, 2, 3, 5, 10, 13, 5, 10, 7 - result_t result3_rev_comp_ungapped{{112,1},{192,182},{1,27},{6,109},{27,101},{109,150}}; - result_t result3_rev_comp_gapped{{4,1},{12,5},{1,3},{2,5},{3,13},{5,13}}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped // ACGGCGACGTTTAG @@ -118,7 +111,8 @@ void compare_types(adaptor_t v) { EXPECT_TRUE(std::ranges::input_range); EXPECT_TRUE(std::ranges::forward_range); - EXPECT_TRUE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); EXPECT_TRUE(std::ranges::view); EXPECT_TRUE(std::ranges::sized_range); EXPECT_TRUE(seqan3::const_iterable_range); @@ -132,13 +126,9 @@ TYPED_TEST(randstrobe_view_properties_test, concepts) auto v = text | kmer_view | randstrobe_view; compare_types(v); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); auto v2 = seqan3::detail::randstrobe_view(text | 
kmer_view,1,3); compare_types(v2); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::random_access_range); - EXPECT_EQ(std::ranges::random_access_range, std::ranges::common_range); } TYPED_TEST(randstrobe_view_properties_test, different_inputs_kmer_hash) @@ -163,7 +153,6 @@ TEST_F(randstrobe_test, ungapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_ungapped, text3 | seqan3::views::complement | std::views::reverse | kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | kmer_view,1,3))); EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::randstrobe_view(text3 | kmer_view,1,3))); @@ -173,7 +162,6 @@ TEST_F(randstrobe_test, gapped_kmer_hash) { EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(result3_rev_comp_gapped, text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | gapped_kmer_view,1,3))); EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::randstrobe_view(text3 | gapped_kmer_view,1,3))); @@ -181,9 +169,6 @@ TEST_F(randstrobe_test, gapped_kmer_hash) TEST_F(randstrobe_test, combinability) { - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_ungapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | kmer_view | randstrobe_view)); - EXPECT_RANGE_EQ(std::views::reverse(result3_rev_comp_gapped), std::views::reverse(text3 | seqan3::views::complement | std::views::reverse | gapped_kmer_view | randstrobe_view)); - auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | 
randstrobe_view); From 7517742b23a90645206459788f75b4cc43949016 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Tue, 24 Jan 2023 16:27:00 +0100 Subject: [PATCH 06/34] [MISC] Speed up unique. --- src/compare.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/compare.cpp b/src/compare.cpp index d2b3dc8..317bc69 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -525,17 +525,6 @@ void speed(std::vector sequence_files, urng_t input_view, outfile.close(); } -uint64_t count_singletons(robin_hood::unordered_node_map & hash_table) -{ - uint64_t singletons{0}; - for (auto && hash: hash_table) - { - if (hash_table[hash.first] == 1) - ++singletons; - } - return singletons; -} - // Input files should be the output files from count void unique(std::vector input_files, std::filesystem::path oname) { @@ -545,18 +534,22 @@ void unique(std::vector input_files, std::filesystem::pat for (int i = 0; i < input_files.size(); ++i) { - robin_hood::unordered_node_map hash_table{}; + //robin_hood::unordered_node_map hash_table{}; infile.open(input_files[i], std::ios::binary); uint64_t submer; uint16_t submer_count; + uint64_t singletons{0}; + uint64_t all_counts{0}; while(infile.read((char*)&submer, sizeof(submer))) { infile.read((char*)&submer_count, sizeof(submer_count)); - hash_table[submer] = submer_count; + if (submer_count == 1) + ++singletons; + all_counts++; } infile.close(); - outfile << input_files[i].stem() << "\t" << (count_singletons(hash_table) * 100.0)/hash_table.size() << "\n"; + outfile << input_files[i].stem() << "\t" << (singletons * 100.0)/all_counts << "\n"; } outfile.close(); From 9e7045c5a05079ee319a907302c52431151ae6e4 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Tue, 24 Jan 2023 17:01:32 +0100 Subject: [PATCH 07/34] [MISC] Python script to mutate fasta file with certain error rate. 
--- .gitignore | 2 ++ src/snakemake/accuracy/add_errors.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 src/snakemake/accuracy/add_errors.py diff --git a/.gitignore b/.gitignore index 7d3c806..d7a2930 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,8 @@ src/snakemake/ !src/snakemake/genmap/Snakefile !src/snakemake/genmap/genmap_uniqueness.py +!src/snakemake/accuracy/add_errors.py + !src/snakemake/speed/README !src/snakemake/speed/Snakefile !src/snakemake/speed/plot_speed.py diff --git a/src/snakemake/accuracy/add_errors.py b/src/snakemake/accuracy/add_errors.py new file mode 100644 index 0000000..124556d --- /dev/null +++ b/src/snakemake/accuracy/add_errors.py @@ -0,0 +1,23 @@ +import os +import sys +import random + +infile = sys.argv[1] +outfile = sys.argv[2] +error = int(sys.argv[3]) + +def mutated_seq(sequence): + positions = random.sample(range(len(sequence)), error) + seqs = list(sequence) + for pos in positions: + seqs[pos] = random.choice([i for i in "ACGT" if (i != sequence[pos])]) + return "".join(seqs) + +with open(outfile, 'w') as o: + with open(infile, 'r') as f: + for line in f: + if (line[0] == '>'): + o.write(line) + else: + o.write(mutated_seq(line.strip())) + o.write("\n") From 9ce21d34dd3b84913d7fc7e94b014c2a813fc3df Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Wed, 25 Jan 2023 14:51:30 +0100 Subject: [PATCH 08/34] [FIX] Speed minstrobemers. 
--- include/minstrobe.hpp | 139 ++++++++++++++++++++++++------------ include/minstrobe_hash.hpp | 22 ++---- src/compare.cpp | 4 +- test/api/minstrobe_test.cpp | 86 +++++++++++----------- 4 files changed, 144 insertions(+), 107 deletions(-) diff --git a/include/minstrobe.hpp b/include/minstrobe.hpp index a72dcf5..93161f4 100644 --- a/include/minstrobe.hpp +++ b/include/minstrobe.hpp @@ -21,6 +21,8 @@ #include #include +#include "shared.hpp" + namespace seqan3::detail { // --------------------------------------------------------------------------------------------------------------------- @@ -60,6 +62,9 @@ class minstrobe_view : public std::ranges::view_interface //!\brief The number of elements in a window. size_t window_size{}; + //!\brief The multiplicator. + uint64_t multi{}; + template class basic_iterator; @@ -84,11 +89,13 @@ class minstrobe_view : public std::ranges::view_interface * std::ranges::forward_range. * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator. */ - minstrobe_view(urng_t urange, size_t const window_dist, size_t const window_size) : + minstrobe_view(urng_t urange, size_t const window_dist, size_t const window_size, uint64_t const multi) : urange{std::move(urange)}, window_dist{window_dist}, - window_size{window_size} + window_size{window_size}, + multi{multi} {} /*!\brief Construct from a non-view that can be view-wrapped and the two (lower and upper) offsets @@ -99,16 +106,18 @@ class minstrobe_view : public std::ranges::view_interface * std::ranges::forward_range. * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator. 
*/ template //!\cond requires (std::ranges::viewable_range && std::constructible_from>>) //!\endcond - minstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size) : + minstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) : urange{std::views::all(std::forward(urange))}, window_dist{window_dist}, - window_size{window_size} + window_size{window_size}, + multi{multi} {} /*!\name Iterators @@ -132,7 +141,8 @@ class minstrobe_view : public std::ranges::view_interface return {std::ranges::begin(urange), std::ranges::end(urange), window_dist, - window_size}; + window_size, + multi}; } //!\copydoc begin() @@ -144,7 +154,8 @@ class minstrobe_view : public std::ranges::view_interface return {std::ranges::cbegin(urange), std::ranges::cend(urange), window_dist, - window_size}; + window_size, + multi}; } /*!\brief Returns an iterator to the element following the last element of the range. @@ -199,9 +210,7 @@ class minstrobe_view::basic_iterator //!\brief Type for distances between iterators. using difference_type = typename std::iter_difference_t; //typename std::ranges::range_difference_t; //!\brief Value type of the iterator. - using value_t = std::ranges::range_value_t; - //!\brief Value type of the output. - using value_type = std::vector; + using value_type = std::ranges::range_value_t; //!\brief The pointer type. using pointer = void; //!\brief Reference to `value_type`. 
@@ -249,7 +258,8 @@ class minstrobe_view::basic_iterator basic_iterator(urng_iterator_t first_iterator, urng_sentinel_t urng_sentinel, size_t window_dist, - size_t window_size) : + size_t window_size, + uint64_t power_multi) : first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, @@ -267,6 +277,16 @@ class minstrobe_view::basic_iterator throw std::invalid_argument{"The given window size is too small.\n" "Please choose a bigger window size greater than 0."}; + if constexpr (order_3) + { + multiplicator = my_pow(4, power_multi*2); + multiplicator3 = my_pow(4, power_multi); + } + else + { + multiplicator = my_pow(4, power_multi); + } + fill_window(); } //!\} @@ -422,6 +442,9 @@ class minstrobe_view::basic_iterator //!\brief The minstrobe value. value_type minstrobe_value{}; + //!\brief The minstrobe value vector. + std::vector minstrobe_value_vec{}; + //!\brief Iterator to the first strobe of minstrobe. urng_iterator_t first_iterator{}; @@ -435,10 +458,10 @@ class minstrobe_view::basic_iterator urng_sentinel_t urng_sentinel{}; //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current minstrobe. - std::deque window_values{}; + std::deque window_values{}; //!\brief Stored values per window for order 3. - std::deque window_values3{}; + std::deque window_values3{}; //!\brief The distance between the first strobe and the second. size_t window_dist{}; @@ -452,6 +475,12 @@ class minstrobe_view::basic_iterator //!\brief The offset of the minstrobe for order 3. size_t minstrobe_position_offset3{}; + //!\brief The multiplicator. + uint64_t multiplicator{}; + + //!\brief The multiplicator for order 3. + uint64_t multiplicator3{}; + //!\brief Advances the window of the iterators to the next position. void advance_windows() { @@ -464,16 +493,13 @@ class minstrobe_view::basic_iterator } } - //!\brief Retreat the window of the iterators to the next position. 
- void retreat_windows() + //!\brief Function that combines strobes. + void combine_strobes() { - --first_iterator; - --second_iterator; - - if constexpr(order_3) - { - --third_iterator; - } + if constexpr (order_3) + minstrobe_value = minstrobe_value_vec[0]*multiplicator + minstrobe_value_vec[1]*multiplicator3 + minstrobe_value_vec[2]; + else + minstrobe_value = minstrobe_value_vec[0]*multiplicator + minstrobe_value_vec[1]; } //!\brief Fills window. @@ -506,18 +532,19 @@ class minstrobe_view::basic_iterator window_values3.push_back(*third_iterator); } - auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); + auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it); if constexpr(order_3) { - auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); + auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3); - minstrobe_value = {*first_iterator, *minstrobe_it, *minstrobe_it3}; + minstrobe_value_vec = {*first_iterator, *minstrobe_it, *minstrobe_it3}; } else { - minstrobe_value = {*first_iterator, *minstrobe_it}; + minstrobe_value_vec = {*first_iterator, *minstrobe_it}; } + combine_strobes(); } /*!\brief Calculates the next minstrobe value. 
@@ -536,20 +563,20 @@ class minstrobe_view::basic_iterator return; } - minstrobe_value[0] = *first_iterator; + minstrobe_value_vec[0] = *first_iterator; window_values.pop_front(); window_values.push_back(*second_iterator); if (minstrobe_position_offset == 0) { - auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); - minstrobe_value[1] = *minstrobe_it; + auto minstrobe_it = std::ranges::min_element(window_values, std::less_equal{}); + minstrobe_value_vec[1] = *minstrobe_it; minstrobe_position_offset = std::distance(std::begin(window_values), minstrobe_it) + 1; } - if (*second_iterator <= minstrobe_value[1]) + if (*second_iterator <= minstrobe_value_vec[1]) { - minstrobe_value[1] = *second_iterator; + minstrobe_value_vec[1] = *second_iterator; minstrobe_position_offset = window_values.size(); } @@ -560,31 +587,32 @@ class minstrobe_view::basic_iterator if (minstrobe_position_offset3 == 0) { - auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); - minstrobe_value[2] = *minstrobe_it3; + auto minstrobe_it3 = std::ranges::min_element(window_values3, std::less_equal{}); + minstrobe_value_vec[2] = *minstrobe_it3; minstrobe_position_offset3 = std::distance(std::begin(window_values3), minstrobe_it3) + 1; } - if (*third_iterator <= minstrobe_value[2]) + if (*third_iterator <= minstrobe_value_vec[2]) { - minstrobe_value[2] = *third_iterator; + minstrobe_value_vec[2] = *third_iterator; minstrobe_position_offset3 = window_values3.size(); } --minstrobe_position_offset3; } + combine_strobes(); --minstrobe_position_offset; } }; //!\brief A deduction guide for the view class template. template -minstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> minstrobe_view>; +minstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> minstrobe_view>; //!\brief A deduction guide for the view class template. 
template -minstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> minstrobe_view, ord>; +minstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> minstrobe_view, ord>; // --------------------------------------------------------------------------------------------------------------------- // minstrobe_fn (adaptor definition) @@ -596,9 +624,15 @@ minstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> struct minstrobe_fn { //!\brief Store the number of values in two windows and return a range adaptor closure object. - constexpr auto operator()(const size_t window_dist, const size_t window_size) const + constexpr auto operator()(bool order3, const size_t window_dist, const size_t window_size, uint64_t const multi) const + { + return adaptor_from_functor{*this, window_dist, window_size, multi, order3}; + } + + //!\brief Store the number of values in two windows and return a range adaptor closure object. + constexpr auto operator()(const size_t window_dist, const size_t window_size, uint64_t const multi) const { - return adaptor_from_functor{*this, window_dist, window_size}; + return adaptor_from_functor{*this, window_dist, window_size, multi}; } /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower @@ -610,19 +644,35 @@ struct minstrobe_fn * \param[in] window_size The number of elements in a window. * \returns A range of the converted values in vectors of size 2. 
*/ - template - constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size) const + template + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) const { static_assert(std::ranges::viewable_range, "The range parameter to views::minstrobe cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::minstrobe must model std::ranges::forward_range."); - if (window_size <= window_dist) - throw std::invalid_argument{"The chosen min and max windows are not valid." - "Please choose a window_size greater than window_dist."}; + return minstrobe_view{urange, window_dist, window_size, multi}; + } + + /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower + * offset and another integer indicating the upper offset of the second window. + * \tparam urng_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_dist The offset for the position of the next window from the previous one. + * \param[in] window_size The number of elements in a window. + * \returns A range of the converted values in vectors of size 2. 
+ */ + template + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi, bool order3) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::minstrobe cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::minstrobe must model std::ranges::forward_range."); - return minstrobe_view{urange, window_dist, window_size}; + return minstrobe_view{urange, window_dist, window_size, multi}; } }; //![adaptor_def] @@ -638,6 +688,7 @@ namespace seqan3::views * \param[in] urange The range being processed. [parameter is omitted in pipe notation] * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator used to combine strobes. Should be the shape.count(). * \returns A range of std::totally_ordered where each value is a vector of size 2. See below for the * properties of the returned range. 
* \ingroup search_views diff --git a/include/minstrobe_hash.hpp b/include/minstrobe_hash.hpp index 4a1f5c2..57ad986 100644 --- a/include/minstrobe_hash.hpp +++ b/include/minstrobe_hash.hpp @@ -84,11 +84,7 @@ struct minstrobe2_hash_fn | std::views::transform([seed] (uint64_t i) {return i ^ seed.get();}); - - auto minstrobes = seqan3::detail::minstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()); - auto forward = std::views::transform(minstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto forward = seqan3::detail::minstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -97,9 +93,8 @@ struct minstrobe2_hash_fn {return i ^ seed.get();}); // Todo: Instead of using vectors, use the std::views::reverse function and zip, but the view reverse is very slow in comparison. 
- auto rev_minstrobes = seqan3::detail::minstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_minstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto reverse = seqan3::detail::minstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); + std::vector rev{}; for(auto && h : reverse) @@ -180,12 +175,7 @@ struct minstrobe3_hash_fn {return i ^ seed.get();}); - auto minstrobes = seqan3::detail::minstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()*2); - uint64_t multiplicator2 = my_pow(4, shape.count()); - auto forward = std::views::transform(minstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); - + auto forward = seqan3::detail::minstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -193,9 +183,7 @@ struct minstrobe3_hash_fn | std::views::transform([seed] (uint64_t i) {return i ^ seed.get();}); - auto rev_minstrobes = seqan3::detail::minstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_minstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); + auto reverse = seqan3::detail::minstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); std::vector rev{}; for(auto && h : reverse) diff --git a/src/compare.cpp b/src/compare.cpp index 317bc69..0c50cf9 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -742,9 +742,9 @@ void do_speed(std::vector 
sequence_files, range_arguments else if (args.hybrid & (args.order == 3)) speed(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max),create_name(args), args); else if (args.minstrobers & (args.order == 2)) - speed(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::minstrobe(args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.minstrobers & (args.order == 3)) - speed(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::minstrobe(true, args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.rand & (args.order == 2)) speed(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); else if (args.rand & (args.order == 3)) diff --git a/test/api/minstrobe_test.cpp b/test/api/minstrobe_test.cpp index 088bd0c..9a301ee 100644 --- a/test/api/minstrobe_test.cpp +++ b/test/api/minstrobe_test.cpp @@ -19,12 +19,13 @@ using seqan3::operator""_dna4; using seqan3::operator""_shape; -using result_t = std::vector>; +using result_t = std::vector; inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); -inline static constexpr auto minstrobe_view = seqan3::views::minstrobe(2,4); +inline static constexpr auto minstrobe_view = seqan3::views::minstrobe(2,4,4); +inline static constexpr auto minstrobe_view_gapped = seqan3::views::minstrobe(2,4,2); using iterator_type = std::ranges::iterator_t< decltype(std::declval() | kmer_view @@ -32,7 +33,7 @@ using iterator_type = std::ranges::iterator_t< decltype(std::declval() | 
kmer_view),3> - {std::declval() | kmer_view, 1, 3})>; + {std::declval() | kmer_view, 1, 3, 4})>; template <> struct iterator_fixture : public ::testing::Test @@ -42,10 +43,10 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,97},{105,27},{166,27},{152,27},{97,27},{134,111}}; + result_t expected_range{6753,26907,42523,38939,24859,34415}; - decltype(seqan3::views::minstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 2, 4)) test_range = - seqan3::views::minstrobe(vec, 2, 4); + decltype(seqan3::views::minstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 2, 4, 4)) test_range = + seqan3::views::minstrobe(vec, 2, 4, 4); }; template <> @@ -56,11 +57,11 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,105,27},{105,97,27},{166,97,27},{152,27,111},{97,27,191}}; + result_t expected_range{1730843,6906139,10903835,9968495,6364095}; decltype(seqan3::detail::minstrobe_view - (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3)) test_range = - seqan3::detail::minstrobe_view(vec, 1, 3); + (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3, 4)) test_range = + seqan3::detail::minstrobe_view(vec, 1, 3, 4); }; using test_types = ::testing::Types; @@ -70,18 +71,14 @@ template class minstrobe_view_properties_test: public ::testing::Test { }; using underlying_range_types = ::testing::Types, - std::vector const, - // seqan3::bitpacked_sequence, - // seqan3::bitpacked_sequence const, - std::list, - std::list const>; + std::vector const>; TYPED_TEST_SUITE(minstrobe_view_properties_test, underlying_range_types, ); class minstrobe_test : public ::testing::Test { protected: std::vector text1{"AAAAAAAAAAAA"_dna4}; - result_t 
result1{{0,0},{0,0},{0,0},{0,0}}; // Same result for ungapped and gapped + result_t result1{0,0,0,0}; // Same result for ungapped and gapped std::vector text3{"ACGGCGACGTTTAG"_dna4}; // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG @@ -93,27 +90,27 @@ class minstrobe_test : public ::testing::Test // stop at T gapped minstrobes: A--GC--C // start at A ungapped minstrobes: GCGAACGT, CGACACGT, GACGCGTT // start at A gapped minstrobes: G--AA--T, C--CA--T, G--GC--T - result_t result3_ungapped{{26,97},{105,27},{166,27},{152,27},{97,27},{134,111}}; - result_t result3_gapped{{2,5},{5,3},{10,3},{8,3},{5,3},{10,7}}; - result_t result3_ungapped_stop{{26,97}}; - result_t result3_gapped_stop{{2,5}}; - result_t result3_ungapped_start{{152,27},{97,27},{134,111}}; - result_t result3_gapped_start{{8,3},{5,3},{10,7}}; + result_t result3_ungapped{6753,26907,42523,38939,24859,34415}; + result_t result3_gapped{37,83,163,131,83,167}; + result_t result3_ungapped_stop{6753}; + result_t result3_gapped_stop{37}; + result_t result3_ungapped_start{38939,24859,34415}; + result_t result3_gapped_start{131,83,167}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped + result_t result3_1{0,0,0}; // Same result for ungapped and gapped // ACGGCGACGTTTAG // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG // ungapped Hashes: 26, 105, 166, 152, 97, 134, 27, 111, 191, 252, 242 // gapped Hashes: 2, 5, 10, 8, 5, 10, 3, 7, 11, 12, 14 - // ungapped minstrobes: ACGGCGGCACGT, CGGCCGACACGT, GGCGCGACACGT, GCGAACGTCGTT, CGACACGTCGTT + // ungapped minstrobes: ACGGCGGCACGT, CGGCCGACACGT, GGCGCGACACGT, GCGAACGTCGTT, CGACACGTGTTT // gapped minstrobes: A--GC--CA--T, C--CC--CA--T, G--GC--CA--T, G--AA--TC--T, C--CA--TG--T // start at A ungapped minstrobes: GGCGCGACACGT, GCGAACGTCGTT, CGACACGTCGTT // start at A gapped minstrobes: G--GC--CA--T, G--AA--TC--T, C--CA--TG--T - result_t 
result3_3_ungapped{{26,105,27},{105,97,27},{166,97,27},{152,27,111},{97,27,191}}; - result_t result3_3_gapped{{2,5,3},{5,5,3},{10,5,3},{8,3,7},{5,3,11}}; - result_t result3_3_ungapped_start{{152,27,111},{97,27,191}}; - result_t result3_3_gapped_start{{8,3,7},{5,3,11}}; + result_t result3_3_ungapped{1730843,6906139,10903835,9968495,6364095}; + result_t result3_3_gapped{595,1363,2643,2103,1339}; + result_t result3_3_ungapped_start{9968495,6364095}; + result_t result3_3_gapped_start{2103,1339}; }; template @@ -129,6 +126,7 @@ void compare_types(adaptor_t v) EXPECT_FALSE((std::ranges::output_range)); } + TYPED_TEST(minstrobe_view_properties_test, concepts) { TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, @@ -137,7 +135,7 @@ TYPED_TEST(minstrobe_view_properties_test, concepts) auto v = text | kmer_view | minstrobe_view; compare_types(v); - auto v2 = seqan3::detail::minstrobe_view(text | kmer_view,1,3); + auto v2 = seqan3::detail::minstrobe_view(text | kmer_view,1,3,4); compare_types(v2); } @@ -145,15 +143,15 @@ TYPED_TEST(minstrobe_view_properties_test, different_inputs_kmer_hash) { TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - result_t ungapped{{27,97},{109,27},{182,27},{216,27},{97,27},{134,111}}; - result_t gapped{{3,5},{5,3},{10,3},{12,3},{5,3},{10,7}}; + result_t ungapped{7009,27931,46619,55323,24859,34415}; + result_t gapped{53,83,163,195,83,167}; EXPECT_RANGE_EQ(ungapped, text | kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | minstrobe_view); + EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | minstrobe_view_gapped); - result_t ungapped3{{27,109,27},{109,97,27},{182,97,27},{216,27,111},{97,27,191}}; - result_t gapped3{{3,5,3},{5,5,3},{10,5,3},{12,3,7},{5,3,11}}; - EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::minstrobe_view(text | 
kmer_view,1,3))); - EXPECT_RANGE_EQ(gapped3, (seqan3::detail::minstrobe_view(text | gapped_kmer_view,1,3))); + result_t ungapped3{1797403,7168283,11952411,14162799,6364095}; + result_t gapped3{851,1363,2643,3127,1339}; + EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::minstrobe_view(text | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(gapped3, (seqan3::detail::minstrobe_view(text | gapped_kmer_view,1,3,2))); } TEST_F(minstrobe_test, ungapped_kmer_hash) @@ -161,27 +159,27 @@ TEST_F(minstrobe_test, ungapped_kmer_hash) EXPECT_RANGE_EQ(result1, text1 | kmer_view | minstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::minstrobe_view(text3 | kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::minstrobe_view(text3 | kmer_view,1,3,4))); } TEST_F(minstrobe_test, gapped_kmer_hash) { - EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | minstrobe_view); + EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | minstrobe_view_gapped); + EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | minstrobe_view_gapped); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | gapped_kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::minstrobe_view(text3 | gapped_kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::minstrobe_view(text1 | gapped_kmer_view,1,3,2))); + EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::minstrobe_view(text3 | gapped_kmer_view,1,3,2))); } TEST_F(minstrobe_test, combinability) { auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | minstrobe_view); - EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | 
gapped_kmer_view | minstrobe_view); + EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | minstrobe_view_gapped); - EXPECT_RANGE_EQ(result3_3_ungapped_start, (seqan3::detail::minstrobe_view(text3 | start_at_a | kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_gapped_start, (seqan3::detail::minstrobe_view(text3 | start_at_a | gapped_kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_3_ungapped_start, (seqan3::detail::minstrobe_view(text3 | start_at_a | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(result3_3_gapped_start, (seqan3::detail::minstrobe_view(text3 | start_at_a | gapped_kmer_view,1,3,2))); /*auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); EXPECT_RANGE_EQ(result3_ungapped_stop, text3 | stop_at_t | kmer_view | minstrobe_view); From bea5fd02792308e0627caa65925de756c5ab751d Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Wed, 25 Jan 2023 17:34:43 +0100 Subject: [PATCH 09/34] [MISC] Make strobemers fast and adapt speed snakefile. 
--- include/hybridstrobe.hpp | 110 +++-- include/hybridstrobe.hpp.save | 739 ------------------------------ include/hybridstrobe_hash.hpp | 20 +- include/minstrobe.hpp | 16 +- include/randstrobe.hpp | 118 +++-- include/randstrobe_hash.hpp | 20 +- include/shared.hpp | 12 - src/compare.cpp | 12 +- src/snakemake/speed/Snakefile | 64 ++- src/snakemake/speed/plot_speed.py | 32 +- test/api/hybridstrobe_test.cpp | 69 +-- test/api/randstrobe_test.cpp | 77 ++-- 12 files changed, 332 insertions(+), 957 deletions(-) delete mode 100644 include/hybridstrobe.hpp.save diff --git a/include/hybridstrobe.hpp b/include/hybridstrobe.hpp index 0a5f886..94d7551 100644 --- a/include/hybridstrobe.hpp +++ b/include/hybridstrobe.hpp @@ -21,6 +21,8 @@ #include #include +#include "shared.hpp" + namespace seqan3::detail { // --------------------------------------------------------------------------------------------------------------------- @@ -60,6 +62,9 @@ class hybridstrobe_view : public std::ranges::view_interface class basic_iterator; @@ -84,11 +89,13 @@ class hybridstrobe_view : public std::ranges::view_interface //!\cond requires (std::ranges::viewable_range && std::constructible_from>>) //!\endcond - hybridstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size) : + hybridstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) : urange{std::views::all(std::forward(urange))}, window_dist{window_dist}, - window_size{window_size} + window_size{window_size}, + multi{multi} {} /*!\name Iterators @@ -132,7 +141,8 @@ class hybridstrobe_view : public std::ranges::view_interface::basic_iterator //!\brief Type for distances between iterators. using difference_type = typename std::iter_difference_t; //typename std::ranges::range_difference_t; //!\brief Value type of the iterator. - using value_t = std::ranges::range_value_t; - //!\brief Value type of the output. 
- using value_type = std::vector; + using value_type = std::ranges::range_value_t; //!\brief The pointer type. using pointer = void; //!\brief Reference to `value_type`. @@ -233,7 +242,9 @@ class hybridstrobe_view::basic_iterator third_iterator{std::move(it.third_iterator)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, - window_size{std::move(it.window_size)} + window_size{std::move(it.window_size)}, + multiplicator{std::move(it.multiplicator)}, + multiplicator3{std::move(it.multiplicator3)} {} @@ -243,13 +254,14 @@ class hybridstrobe_view::basic_iterator * \param[in] urng_sentinel Iterator pointing to the last position of the underlying range. * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * + * \param[in] power_multi The multiplicator. * */ basic_iterator(urng_iterator_t first_iterator, urng_sentinel_t urng_sentinel, size_t window_dist, - size_t window_size) : + size_t window_size, + uint64_t power_multi) : first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, @@ -270,6 +282,16 @@ class hybridstrobe_view::basic_iterator throw std::invalid_argument{"The given window size is too small.\n" "Please choose a bigger window size greater than 0."}; + if constexpr (order_3) + { + multiplicator = my_pow(4, power_multi*2); + multiplicator3 = my_pow(4, power_multi); + } + else + { + multiplicator = my_pow(4, power_multi); + } + elem_r = std::ceil(window_size/3.0); fill_window(); determine_value(); @@ -440,10 +462,10 @@ class hybridstrobe_view::basic_iterator urng_sentinel_t urng_sentinel{}; //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current hybridstrobe. - std::deque window_values{}; + std::deque window_values{}; //!\brief Stored values per window for order 3. 
- std::deque window_values3{}; + std::deque window_values3{}; //!\brief The result of the mod operation, indicates, which part of a window to consider (sub-window). int r_pos{}; @@ -457,6 +479,12 @@ class hybridstrobe_view::basic_iterator //!\brief The number of elements in a window. size_t window_size{}; + //!\brief The multiplicator. + uint64_t multiplicator{}; + + //!\brief The multiplicator for order 3. + uint64_t multiplicator3{}; + //!\brief Advances the window of the iterators to the next position. void advance_windows() { @@ -504,15 +532,15 @@ class hybridstrobe_view::basic_iterator void determine_value() { r_pos = *first_iterator % 3; - auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r), std::ranges::next(window_values.begin(),((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); + auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r), std::ranges::next(window_values.begin(),((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); if constexpr(order_3) { - auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::ranges::next(window_values3.begin(), ((1+r_pos)*elem_r), window_values3.end()), std::less_equal{}); - hybridstrobe_value = {*first_iterator, *hybridstrobe_it, *hybridstrobe_it3}; + auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::ranges::next(window_values3.begin(), ((1+r_pos)*elem_r), window_values3.end()), std::less_equal{}); + hybridstrobe_value = *first_iterator*multiplicator +*hybridstrobe_it*multiplicator3 + *hybridstrobe_it3; } else { - hybridstrobe_value = {*first_iterator, *hybridstrobe_it}; + hybridstrobe_value = *first_iterator*multiplicator + *hybridstrobe_it; } } @@ -547,11 +575,11 @@ class hybridstrobe_view::basic_iterator //!\brief A deduction guide for the view class template. 
template -hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> hybridstrobe_view>; +hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> hybridstrobe_view>; //!\brief A deduction guide for the view class template. template -hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> hybridstrobe_view, ord>; +hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> hybridstrobe_view, ord>; // --------------------------------------------------------------------------------------------------------------------- // hybridstrobe_fn (adaptor definition) @@ -563,9 +591,15 @@ hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) struct hybridstrobe_fn { //!\brief Store the number of values in two windows and return a range adaptor closure object. - constexpr auto operator()(const size_t window_dist, const size_t window_size) const + constexpr auto operator()(bool order3, const size_t window_dist, const size_t window_size, uint64_t const multi) const + { + return adaptor_from_functor{*this, window_dist, window_size, multi, order3}; + } + + //!\brief Store the number of values in two windows and return a range adaptor closure object. + constexpr auto operator()(const size_t window_dist, const size_t window_size, uint64_t const multi) const { - return adaptor_from_functor{*this, window_dist, window_size}; + return adaptor_from_functor{*this, window_dist, window_size, multi}; } /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower @@ -575,21 +609,40 @@ struct hybridstrobe_fn * std::ranges::forward_range. * \param[in] window_dist The offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * \returns A range of the converted values in vectors of size 2. 
+ * \param[in] multi The multiplicator. + * \returns A range of the converted values. */ template - constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size) const + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) const { static_assert(std::ranges::viewable_range, "The range parameter to views::hybridstrobe cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::hybridstrobe must model std::ranges::forward_range."); - if (window_size <= window_dist) - throw std::invalid_argument{"The chosen min and max windows are not valid." - "Please choose a window_size greater than window_dist."}; + return hybridstrobe_view{urange, window_dist, window_size, multi}; + } + + /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower + * offset and another integer indicating the upper offset of the second window. + * \tparam urng_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_dist The offset for the position of the next window from the previous one. + * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator. + * \param[in] order3 Use, if order 3 is wanted. TODO: The actual value does not matter. but make distinction between orders so much easier. + * \returns A range of the converted values. 
+ */ + template + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi, bool order3) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::hybridstrobe cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::hybridstrobe must model std::ranges::forward_range."); - return hybridstrobe_view{urange, window_dist, window_size}; + return hybridstrobe_view{urange, window_dist, window_size, multi}; } }; //![adaptor_def] @@ -605,6 +658,7 @@ namespace seqan3::views * \param[in] urange The range being processed. [parameter is omitted in pipe notation] * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator used to combine strobes. Should be the shape.count(). * \returns A range of std::totally_ordered where each value is a vector of size 2. See below for the * properties of the returned range. * \ingroup search_views diff --git a/include/hybridstrobe.hpp.save b/include/hybridstrobe.hpp.save deleted file mode 100644 index 94dde1e..0000000 --- a/include/hybridstrobe.hpp.save +++ /dev/null @@ -1,739 +0,0 @@ -// ----------------------------------------------------------------------------------------------------- -// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin -// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik -// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License -// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md -// ----------------------------------------------------------------------------------------------------- - -/*!\file - * \author Mitra Darvish - * \brief Provides hybridstrobe. 
- */ - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace seqan3::detail -{ -// --------------------------------------------------------------------------------------------------------------------- -// hybridstrobe_view class -// --------------------------------------------------------------------------------------------------------------------- - -/*!\brief The type returned by hybridstrobe. - * \tparam urng_t The type of the underlying range, must model std::ranges::forward_range, the reference type must - * model std::totally_ordered. The typical use case is that the reference type is the result of - * seqan3::kmer_hash. - * \tparam order The order of strobemers, at the moment the order 2 and 3 are supported. Default is 2. - * \implements std::ranges::view - * \ingroup search_views - * - * - * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. - */ -template -class hybridstrobe_view : public std::ranges::view_interface> -{ -private: - static_assert(std::ranges::forward_range, "The hybridstrobe_view only works on forward_ranges."); - static_assert(std::totally_ordered>, - "The reference type of the underlying range must model std::totally_ordered."); - - //!\brief Whether the given ranges are const_iterable. - static constexpr bool const_iterable = seqan3::const_iterable_range; - - //!\brief Whether the given order is of range 3. - static constexpr bool order_3 = (order == 3); - - //!\brief The underlying range. - urng_t urange{}; - - //!\brief The distance of the second strobe to the first one. - size_t window_dist{}; - - //!\brief The number of elements in a window. - size_t window_size{}; - - template - class basic_iterator; - - //!\brief The sentinel type of the hybridstrobe_view. 
- using sentinel = std::default_sentinel_t; - -public: - /*!\name Constructors, destructor and assignment - * \{ - */ - /// \cond Workaround_Doxygen - hybridstrobe_view() requires std::default_initializable = default; //!< Defaulted. - /// \endcond - hybridstrobe_view(hybridstrobe_view const & rhs) = default; //!< Defaulted. - hybridstrobe_view(hybridstrobe_view && rhs) = default; //!< Defaulted. - hybridstrobe_view & operator=(hybridstrobe_view const & rhs) = default; //!< Defaulted. - hybridstrobe_view & operator=(hybridstrobe_view && rhs) = default; //!< Defaulted. - ~hybridstrobe_view() = default; //!< Defaulted. - - /*!\brief Construct from a view and the two (lower and upper) offsets of the second window. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_dist The lower offset for the position of the next window from the previous one. - * \param[in] window_size The number of elements in a window. - */ - hybridstrobe_view(urng_t urange, size_t const window_dist, size_t const window_size) : - urange{std::move(urange)}, - window_dist{window_dist}, - window_size{window_size} - {} - - /*!\brief Construct from a non-view that can be view-wrapped and the two (lower and upper) offsets - * of the second window. - * \tparam other_urng_t The type of another urange. Must model std::ranges::viewable_range and be - constructible from urng_t. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_dist The lower offset for the position of the next window from the previous one. - * \param[in] window_size The number of elements in a window. 
- */ - template - //!\cond - requires (std::ranges::viewable_range && - std::constructible_from>>) - //!\endcond - hybridstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size) : - urange{std::views::all(std::forward(urange))}, - window_dist{window_dist}, - window_size{window_size} - {} - - /*!\name Iterators - * \{ - */ - /*!\brief Returns an iterator to the first element of the range. - * \returns Iterator to the first element. - * - * \details - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * Strong exception guarantee. - */ - basic_iterator begin() - { - return {std::ranges::begin(urange), - std::ranges::end(urange), - window_dist, - window_size}; - } - - //!\copydoc begin() - basic_iterator begin() const - //!\cond - requires const_iterable - //!\endcond - { - return {std::ranges::cbegin(urange), - std::ranges::cend(urange), - window_dist, - window_size}; - } - - /*!\brief Returns an iterator to the element following the last element of the range. - * \returns Iterator to the end. - * - * \details - * - * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. - * - * ### Complexity - * - * Constant. - * - * ### Exceptions - * - * No-throw guarantee. - */ - sentinel end() const - { - return {}; - } - /*basic_iterator end() - { - return {std::ranges::begin(urange), - std::ranges::end(urange), - window_dist, - window_size, - true}; - } - basic_iterator end() const - requires const_iterable -{ - return {std::ranges::cbegin(urange), - std::ranges::cend(urange), - window_dist, - window_size, - true}; - }*/ - //!\} -}; - -//!\brief Iterator for calculating hybridstrobes. -template -template -class hybridstrobe_view::basic_iterator -{ -private: - //!\brief The sentinel type of the underlying range. - using urng_sentinel_t = maybe_const_sentinel_t; - //!\brief The iterator type of the underlying range. 
- using urng_iterator_t = maybe_const_iterator_t; - - template - friend class basic_iterator; - -public: - /*!\name Associated types - * \{ - */ - //!\brief Type for distances between iterators. - using difference_type = std::ranges::range_difference_t; - //!\brief Value type of the iterator. - using value_t = std::ranges::range_value_t; - //!\brief Value type of the output. - using value_type = std::vector; - //!\brief The pointer type. - using pointer = void; - //!\brief Reference to `value_type`. - using reference = value_type; - //!\brief Tag this class as a bidirectional iterator. - using iterator_category = std::bidirectional_iterator_tag; - //!\brief Tag this class as a bidirectional iterator. - using iterator_concept = iterator_category; - //!\} - - /*!\name Constructors, destructor and assignment - * \{ - */ - basic_iterator() = default; //!< Defaulted. - basic_iterator(basic_iterator const &) = default; //!< Defaulted. - basic_iterator(basic_iterator &&) = default; //!< Defaulted. - basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. - basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. - ~basic_iterator() = default; //!< Defaulted. - - //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. - basic_iterator(basic_iterator const & it) - //!\cond - requires const_range - //!\endcond - : hybridstrobe_value{std::move(it.hybridstrobe_value)}, - first_iterator{std::move(it.first_iterator)}, - second_iterator{std::move(it.second_iterator)}, - third_iterator{std::move(it.third_iterator)}, - second_iterator_back{std::move(it.second_iterator_back)}, - third_iterator_back{std::move(it.third_iterator_back)}, - urng_sentinel{std::move(it.urng_sentinel)} - - {} - - /*!\brief Construct from two begin and one end iterators of a given range over std::totally_ordered values, and the two - * (lower and upper) offsets of the second window. 
- * \param[in] second_iterator Iterator pointing to the first position of the second window of the std::totally_ordered range. - * \param[in] third_iterator Iterator pointing to the first position of the third window of the std::totally_ordered range. - * \param[in] urng_sentinel Iterator pointing to the last position of the second window of the std::totally_ordered range. - * \param[in] window_dist The lower offset for the position of the next window from the previous one. - * \param[in] window_size The number of elements in a window. - * - * \details - * - * Looks at the number of values per two windows with three iterators. First iterator adds the next value in the vector as - * the first strobe. The second iterator adds the minimum value of the second window to the second position of the vector. - * The third iterator adds the minimum value of the third window to the third position of the vector. - * - */ - basic_iterator(urng_iterator_t second_iterator, - urng_sentinel_t urng_sentinel, - size_t window_dist, - size_t window_size) : - first_iterator{second_iterator}, - second_iterator{second_iterator}, - third_iterator{second_iterator}, - second_iterator_back{second_iterator}, - third_iterator_back{second_iterator}, - urng_sentinel{std::move(urng_sentinel)} - { - size_t size{}; - if constexpr(order_3) - { - size = std::ranges::distance(third_iterator, urng_sentinel); - } - else - { - size = std::ranges::distance(second_iterator, urng_sentinel); - } - - if (window_size + 1 > size) - throw std::invalid_argument{"The given sequence is too short to satisfy the given parameters.\n" - "Please choose a smaller window min and size."}; - // Throws, if the second group is not as big as the first group - if (std::ceil((window_size - std::ceil(window_size/3.0))/2.0) != std::ceil(window_size/3.0)) - throw std::invalid_argument{"The given window size is too short.\n" - "Please choose a bigger window size."}; - - auto check_end = first_iterator; - auto check_end2 = 
first_iterator; ---check_end; -++check_end2; -seqan3::debug_stream << "Check " << *first_iterator /*<< " " << *check_end << " " << check_end2*/ <<"\n"; - std::cout << "Check end: " << (check_end == urng_sentinel) << " " << (check_end2 == urng_sentinel) <<"\n"; - if constexpr (reverse) - window_first_from_end(window_dist, window_size); - else - window_first(window_dist, window_size); - } - //!\} - - //!\anchor basic_iterator_comparison_hybridstrobe - //!\name Comparison operators - //!\{ - - //!\brief Compare to another basic_iterator. - friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) - { - return (lhs.first_iterator == rhs.first_iterator) && - (lhs.second_iterator == rhs.second_iterator) && - (lhs.third_iterator == rhs.third_iterator); - } - - //!\brief Compare to another basic_iterator. - friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) - { - return !(lhs == rhs); - } - - //!\brief Compare to the sentinel of the hybridstrobe_view. - friend bool operator==(basic_iterator const & lhs, sentinel const &) - { - if constexpr(order_3) - return lhs.third_iterator == lhs.urng_sentinel; - else - return lhs.second_iterator == lhs.urng_sentinel; - } - - //!\brief Compare to the sentinel of the hybridstrobe_view. - friend bool operator==(sentinel const & lhs, basic_iterator const & rhs) - { - return rhs == lhs; - } - - //!\brief Compare to the sentinel of the hybridstrobe_view. - friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs) - { - return !(lhs == rhs); - } - - //!\brief Compare to the sentinel of the hybridstrobe_view. - friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs) - { - return !(lhs == rhs); - } - //!\} - - //!\brief Pre-increment. - basic_iterator & operator++() noexcept - { - next_hybridstrobe(); - return *this; - } - - //!\brief Post-increment. 
- basic_iterator operator++(int) noexcept - { - basic_iterator tmp{*this}; - next_hybridstrobe(); - return tmp; - } - - /*!\brief Pre-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator & operator--() noexcept - requires std::ranges::bidirectional_range - { - prev_hybridstrobe(); - return *this; - } - - /*!\brief Post-decrement. - * \attention This function is only available if underlying range is bidirectional. - */ - basic_iterator operator--(int) noexcept - requires std::ranges::bidirectional_range - { - basic_iterator tmp{*this}; - prev_hybridstrobe(); - return tmp; - } - - //!\brief Return the hybridstrobe. - value_type operator*() const noexcept - { - return hybridstrobe_value; - } - -private: - //!\brief The hybridstrobe value. - value_type hybridstrobe_value{}; - - //!\brief The offset relative to the beginning of the window where the hybridstrobe value is found. - size_t hybridstrobe_position_offset{}; - - //!\brief The offset relative to the beginning of the window where the hybridstrobe value is found. - size_t hybridstrobe_position_offset3{}; - - //!\brief Iterator to the first strobe of hybridstrobe. - urng_iterator_t first_iterator{}; - - //!\brief Iterator to the right most value of the window and hence the second strobe of hybridstrobe. - urng_iterator_t second_iterator{}; - - //!\brief Iterator to the right most value of the window and hence the third strobe of hybridstrobe. - urng_iterator_t third_iterator{}; - - //!\brief Iterator to the left most value of the window and hence the second strobe of minstrobe for bidirectionality. - urng_iterator_t second_iterator_back{}; - - //!\brief Iterator to the left most value of the window and hence the third strobe of minstrobe for bidirectionality. - urng_iterator_t third_iterator_back{}; - - //!\brief Iterator to first element in range. - urng_iterator_t urng_first{}; - - //!\brief Iterator to last element in range. 
- urng_sentinel_t urng_sentinel{}; - - //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current hybridstrobe. - std::deque window_values{}; - - //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current hybridstrobe. - std::deque window_values3{}; - - int r_pos{}; - int elem_r{}; - bool is_first = true; - - //!\brief Advances the window of the iterators to the next position. - void advance_windows() - { - ++first_iterator; - ++second_iterator; - ++second_iterator_back; - - if constexpr(order_3) - { - ++third_iterator; - ++third_iterator_back; - } - } - - //!\brief Retreat the window of the iterators to the next position. - void retreat_windows() - { - --first_iterator; - --second_iterator; - --second_iterator_back; - - if constexpr(order_3) - { - --third_iterator; - --third_iterator_back; - } - } - - //!\brief Calculates hybridstrobes for the first window. - void window_first(const size_t window_dist, const size_t window_size) - { - if (window_size == 0u) - return; - - urng_first = second_iterator; - first_iterator = second_iterator; - r_pos = *first_iterator % 3; - elem_r = std::ceil(window_size/3.0); - std::ranges::advance(second_iterator, window_dist); - second_iterator_back = second_iterator; -seqan3::debug_stream << "Window It:" << *first_iterator << " " << *second_iterator << " " << *second_iterator_back << "\n"; -if constexpr(order_3) - { - third_iterator = second_iterator; - third_iterator_back = second_iterator_back; - std::ranges::advance(third_iterator, window_dist + window_size - 1); - std::ranges::advance(third_iterator_back, window_dist + window_size - 1); - } - - for (int i = 1u; i < window_size; ++i) - { - window_values.push_back(*second_iterator); - ++second_iterator; - - if constexpr(order_3) - { - window_values3.push_back(*third_iterator); - ++third_iterator; - } - } - window_values.push_back(*second_iterator); -seqan3::debug_stream << "Window_values: 
" << window_values << "\n"; - auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r),std::min(window_values.begin() + ((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); - - if constexpr(order_3) - { - window_values3.push_back(*third_iterator); - auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::min(window_values3.begin() + ((1+r_pos)*elem_r), window_values3.end()), std::less_equal{}); - hybridstrobe_value = {*first_iterator, *hybridstrobe_it, *hybridstrobe_it3}; - } - else - { - hybridstrobe_value = {*first_iterator, *hybridstrobe_it}; - } - - } -//!\brief Calculates hybridstrobes for the first window from the end. - void window_first_from_end(const size_t window_dist, const size_t window_size) - { - if (window_size == 0u) - return; - - urng_first = second_iterator; - first_iterator = second_iterator; - r_pos = *first_iterator % 3; - elem_r = std::ceil(window_size/3.0); - std::ranges::advance(second_iterator, window_dist); - second_iterator_back = second_iterator; - if constexpr(order_3) - { - third_iterator = second_iterator; - third_iterator_back = second_iterator_back; - std::ranges::advance(third_iterator, window_dist + window_size - 1); - std::ranges::advance(third_iterator_back, window_dist + window_size - 1); - } - - for (int i = 1u; i < window_size; ++i) - { - window_values.push_front(*second_iterator); - ++second_iterator; - - if constexpr(order_3) - { - window_values3.push_front(*third_iterator); - ++third_iterator; - } - } - window_values.push_front(*second_iterator); - - auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r),std::min(window_values.begin() + ((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); - - if constexpr(order_3) - { - window_values3.push_front(*third_iterator); - auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::min(window_values3.begin() + ((1+r_pos)*elem_r), 
window_values3.end()), std::less_equal{}); - hybridstrobe_value = {*first_iterator, *hybridstrobe_it, *hybridstrobe_it3}; - } - else - { - hybridstrobe_value = {*first_iterator, *hybridstrobe_it}; - } - } - /*!\brief Calculates the next hybridstrobe value. - * \details - * For the following windows, we remove the first window value (is now not in window_values) and add the new - * value that results from the window shifting. - */ - void next_hybridstrobe() - { - advance_windows(); - is_first = false; - r_pos = *first_iterator % 3; - - if constexpr(order_3) - { - if (third_iterator == urng_sentinel) - return; - } - else - { - if (second_iterator == urng_sentinel) - return; - } - - hybridstrobe_value[0]= *first_iterator; - window_values.pop_front(); - window_values.push_back(*second_iterator); - auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r), std::min(window_values.begin() + ((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); - if constexpr(order_3) - { - window_values3.pop_front(); - window_values3.push_back(*third_iterator); - auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::min(window_values3.begin() + ((1+r_pos)*elem_r), window_values3.end()), std::less_equal{}); - hybridstrobe_value = {*first_iterator, *hybridstrobe_it, *hybridstrobe_it3}; - } - else - { - hybridstrobe_value = {*first_iterator, *hybridstrobe_it}; - } - } - - /*!\brief Calculates the previous hybridstrobe value. - * \details - * For the following windows, we remove the last window value (is now not in window_values) and add the new - * value that results from the window shifting. 
- */ - void prev_hybridstrobe() - requires std::ranges::bidirectional_range - { - if (is_first) - return; - - retreat_windows(); - r_pos = *first_iterator % 3; - - hybridstrobe_value[0] = *first_iterator; - window_values.pop_back(); - window_values.push_front(*second_iterator_back); - auto hybridstrobe_it = std::ranges::min_element(window_values.begin() + (r_pos*elem_r), std::min(window_values.begin() + ((1+r_pos)*elem_r), window_values.end()), std::less_equal{}); - seqan3::debug_stream << "It: " << *first_iterator << "," << *hybridstrobe_it <<"\n"; - if constexpr(order_3) - { - window_values3.pop_back(); - window_values3.push_front(*third_iterator_back); - auto hybridstrobe_it3 = std::ranges::min_element(window_values3.begin() + (r_pos*elem_r), std::min(window_values3.begin() + ((1+r_pos)*elem_r), window_values3.end()), std::less_equal{}); - hybridstrobe_value = {*first_iterator, *hybridstrobe_it, *hybridstrobe_it3}; - } - else - { - hybridstrobe_value = {*first_iterator, *hybridstrobe_it}; - } - - if (second_iterator_back == urng_first) - is_first = true; - if constexpr(order_3) - { - if (third_iterator_back == urng_first) - is_first = true; - } - } -}; - - - -//!\brief A deduction guide for the view class template. -template -hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> hybridstrobe_view>; - -//!\brief A deduction guide for the view class template. -template -hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> hybridstrobe_view, ord>; - -//!\brief A deduction guide for the view class template. 
-template -hybridstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> hybridstrobe_view, ord, rev>; - -// --------------------------------------------------------------------------------------------------------------------- -// hybridstrobe_fn (adaptor definition) -// --------------------------------------------------------------------------------------------------------------------- - -//![adaptor_def] -//!\brief hybridstrobe's range adaptor object type (non-closure). -//!\ingroup search_views -struct hybridstrobe_fn -{ - //!\brief Store the number of values in two windows and return a range adaptor closure object. - constexpr auto operator()(const size_t window_dist, const size_t window_size) const - { - return adaptor_from_functor{*this, window_dist, window_size}; - } - - /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower - * offset and another integer indicating the upper offset of the second window. - * \tparam urng_t The type of the input range to process. Must model std::ranges::viewable_range. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and - * std::ranges::forward_range. - * \param[in] window_dist The lower offset for the position of the next window from the previous one. - * \param[in] window_size The number of elements in a window. - * \returns A range of the converted values in vectors of size 2. - */ - template - constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::hybridstrobe cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::hybridstrobe must model std::ranges::forward_range."); - - if (window_size <= window_dist) - throw std::invalid_argument{"The chosen min and max windows are not valid." 
- "Please choose a window_size greater than window_dist."}; - - return hybridstrobe_view{urange, window_dist, window_size}; - } -}; -//![adaptor_def] - -} // namespace seqan3::detail - -namespace seqan3::views -{ -/*!\brief Computes hybridstrobes for a range of comparable values. A hybridstrobe consists of a starting strobe - * concatenated with n−1 consecutively concatenated minimizers. - * \tparam urng_t The type of the range being processed. See below for requirements. [template - * parameter is omitted in pipe notation] - * \param[in] urange The range being processed. [parameter is omitted in pipe notation] - * \param[in] window_dist The lower offset for the position of the next window from the previous one. - * \param[in] window_size The number of elements in a window. - * \returns A range of std::totally_ordered where each value is a vector of size 2. See below for the - * properties of the returned range. - * \ingroup search_views - * - * \details - * - * A hybridstrobe defined by [Sahlin](https://genome.cshlp.org/content/31/11/2080.full.pdf) consists of - * a starting strobe concatenated with n−1 consecutively concatenated minimizers in their respective windows. - * For example for the following list of hash values `[6, 26, 41, 38, 24, 33, 6, 27, 47]` and 3 as `window_dist`, - * 4 as `window_size`, the hybridstrobe values are `[(6,24),(26,6),(41,6),(38,6)]`. 
- * - * ### View properties - * - * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | - * |----------------------------------|:----------------------------------:|:--------------------------------:| - * | std::ranges::input_range | *required* | *preserved* | - * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *preserved* | - * | std::ranges::random_access_range | | *lost* | - * | std::ranges::contiguous_range | | *lost* | - * | | | | - * | std::ranges::viewable_range | *required* | *guaranteed* | - * | std::ranges::view | | *guaranteed* | - * | std::ranges::sized_range | | *lost* | - * | std::ranges::common_range | | *lost* | - * | std::ranges::output_range | | *lost* | - * | seqan3::const_iterable_range | | *preserved* | - * | | | | - * | std::ranges::range_reference_t | std::totally_ordered | std::totally_ordered | - * - * See the views views submodule documentation for detailed descriptions of the view properties. 
- */ -inline constexpr auto hybridstrobe = detail::hybridstrobe_fn{}; - -} // namespace seqan3::views diff --git a/include/hybridstrobe_hash.hpp b/include/hybridstrobe_hash.hpp index bb462a9..a8dea65 100644 --- a/include/hybridstrobe_hash.hpp +++ b/include/hybridstrobe_hash.hpp @@ -85,10 +85,7 @@ struct hybridstrobe2_hash_fn {return i ^ seed.get();}); - auto hybridstrobes = seqan3::detail::hybridstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()); - auto forward = std::views::transform(hybridstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto forward = seqan3::detail::hybridstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -97,9 +94,7 @@ struct hybridstrobe2_hash_fn {return i ^ seed.get();}); - auto rev_hybridstrobes = seqan3::detail::hybridstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_hybridstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto reverse = seqan3::detail::hybridstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); std::vector rev{}; for(auto && h : reverse) @@ -180,12 +175,7 @@ struct hybridstrobe3_hash_fn {return i ^ seed.get();}); - auto hybridstrobes = seqan3::detail::hybridstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()*2); - uint64_t multiplicator2 = my_pow(4, shape.count()); - auto forward = std::views::transform(hybridstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); - + auto forward 
= seqan3::detail::hybridstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -194,9 +184,7 @@ struct hybridstrobe3_hash_fn {return i ^ seed.get();}); - auto rev_hybridstrobes = seqan3::detail::hybridstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_hybridstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); + auto reverse = seqan3::detail::hybridstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); std::vector rev{}; for(auto && h : reverse) diff --git a/include/minstrobe.hpp b/include/minstrobe.hpp index 93161f4..ac132ba 100644 --- a/include/minstrobe.hpp +++ b/include/minstrobe.hpp @@ -242,7 +242,9 @@ class minstrobe_view::basic_iterator third_iterator{std::move(it.third_iterator)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, - window_size{std::move(it.window_size)} + window_size{std::move(it.window_size)}, + multiplicator{std::move(it.multiplicator)}, + multiplicator3{std::move(it.multiplicator3)} {} @@ -252,8 +254,7 @@ class minstrobe_view::basic_iterator * \param[in] urng_sentinel Iterator pointing to the last position of the underlying range. * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * - * + * \param[in] power_multi The multiplicator. */ basic_iterator(urng_iterator_t first_iterator, urng_sentinel_t urng_sentinel, @@ -642,9 +643,10 @@ struct minstrobe_fn * std::ranges::forward_range. * \param[in] window_dist The offset for the position of the next window from the previous one. 
* \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator. * \returns A range of the converted values in vectors of size 2. */ - template + template constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) const { static_assert(std::ranges::viewable_range, @@ -652,7 +654,7 @@ struct minstrobe_fn static_assert(std::ranges::forward_range, "The range parameter to views::minstrobe must model std::ranges::forward_range."); - return minstrobe_view{urange, window_dist, window_size, multi}; + return minstrobe_view{urange, window_dist, window_size, multi}; } /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower @@ -662,7 +664,9 @@ struct minstrobe_fn * std::ranges::forward_range. * \param[in] window_dist The offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * \returns A range of the converted values in vectors of size 2. + * \param[in] multi The multiplicator. + * \param[in] order3 Use, if order 3 is wanted. TODO: The actual value does not matter. but make distinction between orders so much easier. + * \returns A range of the converted values. 
*/ template constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi, bool order3) const diff --git a/include/randstrobe.hpp b/include/randstrobe.hpp index d46f743..70dd9e2 100644 --- a/include/randstrobe.hpp +++ b/include/randstrobe.hpp @@ -21,6 +21,8 @@ #include #include +#include "shared.hpp" + namespace seqan3::detail { // --------------------------------------------------------------------------------------------------------------------- @@ -60,6 +62,9 @@ class randstrobe_view : public std::ranges::view_interface class basic_iterator; @@ -84,11 +89,13 @@ class randstrobe_view : public std::ranges::view_interface //!\cond requires (std::ranges::viewable_range && std::constructible_from>>) //!\endcond - randstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size) : + randstrobe_view(other_urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) : urange{std::views::all(std::forward(urange))}, window_dist{window_dist}, - window_size{window_size} + window_size{window_size}, + multi{multi} {} /*!\name Iterators @@ -132,7 +141,8 @@ class randstrobe_view : public std::ranges::view_interface::basic_iterator //!\brief Type for distances between iterators. using difference_type = typename std::iter_difference_t; //typename std::ranges::range_difference_t; //!\brief Value type of the iterator. - using value_t = std::ranges::range_value_t; - //!\brief Value type of the output. - using value_type = std::vector; + using value_type = std::ranges::range_value_t; //!\brief The pointer type. using pointer = void; //!\brief Reference to `value_type`. 
@@ -233,7 +242,9 @@ class randstrobe_view::basic_iterator third_iterator{std::move(it.third_iterator)}, urng_sentinel{std::move(it.urng_sentinel)}, window_dist{std::move(it.window_dist)}, - window_size{std::move(it.window_size)} + window_size{std::move(it.window_size)}, + multiplicator{std::move(it.multiplicator)}, + multiplicator3{std::move(it.multiplicator3)} {} @@ -243,13 +254,13 @@ class randstrobe_view::basic_iterator * \param[in] urng_sentinel Iterator pointing to the last position of the underlying range. * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * - * + * \param[in] power_multi The multiplicator. */ basic_iterator(urng_iterator_t first_iterator, urng_sentinel_t urng_sentinel, size_t window_dist, - size_t window_size) : + size_t window_size, + uint64_t power_multi) : first_iterator{first_iterator}, second_iterator{first_iterator}, third_iterator{first_iterator}, @@ -267,6 +278,15 @@ class randstrobe_view::basic_iterator throw std::invalid_argument{"The given window size is too small.\n" "Please choose a bigger window size greater than 0."}; + if constexpr (order_3) + { + multiplicator = my_pow(4, power_multi*2); + multiplicator3 = my_pow(4, power_multi); + } + else + { + multiplicator = my_pow(4, power_multi); + } fill_window(); } //!\} @@ -443,14 +463,20 @@ class randstrobe_view::basic_iterator //!\brief The bitmask. size_t bitmask{0x1C5C4AE}; + //!\brief The multiplicator. + uint64_t multiplicator{}; + + //!\brief The multiplicator for order 3. + uint64_t multiplicator3{}; + //!\brief Link two strobes. - value_t linking(value_t const & first, value_t const & second) + value_type linking(value_type const & first, value_type const & second) { return (first+second) &bitmask; } //!\brief Link three strobes. 
- value_t linking(value_t const & first, value_t const & second, value_t const & third) + value_type linking(value_type const & first, value_type const & second, value_type const & third) { return (first+second+third) & bitmask; } @@ -459,16 +485,16 @@ class randstrobe_view::basic_iterator void fill_window() { //!\brief Stores minimum for order 2. - value_t minimum{}; + value_type minimum{}; //!\brief Stores minimum for order 3. - value_t minimum3{}; + value_type minimum3{}; //!\brief Stores minimum hash value for order 2. - value_t minimum_hash{}; + value_type minimum_hash{}; //!\brief Stores minimum hash value for order 3. - value_t minimum_hash3{}; + value_type minimum_hash3{}; second_iterator = first_iterator; std::ranges::advance(second_iterator, window_dist); @@ -484,7 +510,7 @@ class randstrobe_view::basic_iterator for (int i = 1u; i < window_size; ++i) { ++second_iterator; - value_t new_value = linking(*first_iterator, *second_iterator); + value_type new_value = linking(*first_iterator, *second_iterator); if (new_value <= minimum_hash) { minimum_hash = new_value; @@ -500,7 +526,7 @@ class randstrobe_view::basic_iterator for (int i = 1u; i < window_size; ++i) { ++third_iterator; - value_t new_value = linking(*first_iterator, minimum, *third_iterator); + value_type new_value = linking(*first_iterator, minimum, *third_iterator); if (new_value <= minimum_hash3) { minimum_hash3 = new_value; @@ -508,11 +534,11 @@ class randstrobe_view::basic_iterator } } - randstrobe_value = {*first_iterator, minimum, minimum3}; + randstrobe_value = *first_iterator*multiplicator + minimum*multiplicator3 + minimum3; } else { - randstrobe_value = {*first_iterator, minimum}; + randstrobe_value = *first_iterator*multiplicator + minimum; } } @@ -538,11 +564,11 @@ class randstrobe_view::basic_iterator //!\brief A deduction guide for the view class template. 
template -randstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> randstrobe_view>; +randstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> randstrobe_view>; //!\brief A deduction guide for the view class template. template -randstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> randstrobe_view, ord>; +randstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size, uint64_t const multi) -> randstrobe_view, ord>; // --------------------------------------------------------------------------------------------------------------------- // randstrobe_fn (adaptor definition) @@ -554,9 +580,15 @@ randstrobe_view(rng_t &&, size_t const window_dist, size_t const window_size) -> struct randstrobe_fn { //!\brief Store the number of values in two windows and return a range adaptor closure object. - constexpr auto operator()(const size_t window_dist, const size_t window_size) const + constexpr auto operator()(bool order3, const size_t window_dist, const size_t window_size, uint64_t const multi) const + { + return adaptor_from_functor{*this, window_dist, window_size, multi, order3}; + } + + //!\brief Store the number of values in two windows and return a range adaptor closure object. + constexpr auto operator()(const size_t window_dist, const size_t window_size, uint64_t const multi) const { - return adaptor_from_functor{*this, window_dist, window_size}; + return adaptor_from_functor{*this, window_dist, window_size, multi}; } /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower @@ -566,21 +598,40 @@ struct randstrobe_fn * std::ranges::forward_range. * \param[in] window_dist The offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. - * \returns A range of the converted values in vectors of size 2. + * \param[in] multi The multiplicator. 
+ * \returns A range of the converted values. */ template - constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size) const + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi) const { static_assert(std::ranges::viewable_range, "The range parameter to views::randstrobe cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::randstrobe must model std::ranges::forward_range."); - if (window_size <= window_dist) - throw std::invalid_argument{"The chosen min and max windows are not valid." - "Please choose a window_size greater than window_dist."}; + return randstrobe_view{urange, window_dist, window_size, multi}; + } + + /*!\brief Call the view's constructor with three arguments: the underlying view and an integer indicating a lower + * offset and another integer indicating the upper offset of the second window. + * \tparam urng_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_dist The offset for the position of the next window from the previous one. + * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator. + * \param[in] order3 Use, if order 3 is wanted. TODO: The actual value does not matter. but make distinction between orders so much easier. + * \returns A range of the converted values. 
+ */ + template + constexpr auto operator()(urng_t && urange, size_t const window_dist, size_t const window_size, uint64_t const multi, bool order3) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::randstrobe cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::randstrobe must model std::ranges::forward_range."); - return randstrobe_view{urange, window_dist, window_size}; + return randstrobe_view{urange, window_dist, window_size, multi}; } }; //![adaptor_def] @@ -596,6 +647,7 @@ namespace seqan3::views * \param[in] urange The range being processed. [parameter is omitted in pipe notation] * \param[in] window_dist The lower offset for the position of the next window from the previous one. * \param[in] window_size The number of elements in a window. + * \param[in] multi The multiplicator used to combine strobes. Should be the shape.count(). * \returns A range of std::totally_ordered where each value is a vector of size 2. See below for the * properties of the returned range. 
* \ingroup search_views diff --git a/include/randstrobe_hash.hpp b/include/randstrobe_hash.hpp index 26dd695..5cbac5c 100644 --- a/include/randstrobe_hash.hpp +++ b/include/randstrobe_hash.hpp @@ -85,10 +85,7 @@ struct randstrobe2_hash_fn {return i ^ seed.get();}); - auto randstrobes = seqan3::detail::randstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()); - auto forward = std::views::transform(randstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto forward = seqan3::detail::randstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -97,9 +94,7 @@ struct randstrobe2_hash_fn {return i ^ seed.get();}); - auto rev_randstrobes = seqan3::detail::randstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_randstrobes, [multiplicator] (std::vector i) - {return combine_strobes(multiplicator, i[0], i[1]);}); + auto reverse = seqan3::detail::randstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); std::vector rev{}; for(auto && h : reverse) @@ -181,12 +176,7 @@ struct randstrobe3_hash_fn {return i ^ seed.get();}); - auto randstrobes = seqan3::detail::randstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - uint64_t multiplicator = my_pow(4, shape.count()*2); - uint64_t multiplicator2 = my_pow(4, shape.count()); - auto forward = std::views::transform(randstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); - + auto forward = seqan3::detail::randstrobe_view(hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, 
shape.count()); auto rev_hashed_values = std::forward(urange) | seqan3::views::complement | std::views::reverse @@ -194,9 +184,7 @@ struct randstrobe3_hash_fn | std::views::transform([seed] (uint64_t i) {return i ^ seed.get();}); - auto rev_randstrobes = seqan3::detail::randstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1); - auto reverse = std::views::transform(rev_randstrobes, [multiplicator, multiplicator2] (std::vector i) - {return combine_strobes(multiplicator, multiplicator2, i[0], i[1], i[2]);}); + auto reverse = seqan3::detail::randstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); std::vector rev{}; for(auto && h : reverse) diff --git a/include/shared.hpp b/include/shared.hpp index 24fe286..f0dc473 100644 --- a/include/shared.hpp +++ b/include/shared.hpp @@ -27,18 +27,6 @@ uint64_t fnv_hash(uint64_t hash_value, uint64_t seed) return hashed; } -//!\brief Function that combines strobes for strobemer hash functions. -uint64_t combine_strobes(uint64_t multiplicator, uint64_t first_strobe, uint64_t second_strobe) -{ - return first_strobe*multiplicator + second_strobe; -} - -//!\brief Function that combines strobes for strobemer hash functions. -uint64_t combine_strobes(uint64_t multiplicator, uint64_t multiplicator2, uint64_t first_strobe, uint64_t second_strobe, uint64_t third_strobe) -{ - return first_strobe*multiplicator + second_strobe*multiplicator2 + third_strobe; -} - //!\brief My own pow, which should be slightly faster than std::pow for n < 100. 
uint64_t my_pow(uint64_t x, uint64_t n){ uint64_t r = 1; diff --git a/src/compare.cpp b/src/compare.cpp index 0c50cf9..44a6908 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -494,7 +494,7 @@ void speed(std::vector sequence_files, urng_t input_view, start = std::chrono::high_resolution_clock::now(); get_strobemers(seq, args, strobes_vector); for (auto & t : strobes_vector) // iterate over the strobemer tuples - count += std::get<0>(t); + count += std::get<0>(t); end = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(end - start); speed_results.push_back(duration.count()); @@ -710,7 +710,7 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen } } - +// Note: Speed is based on non-canonical version! void do_speed(std::vector sequence_files, range_arguments & args) { switch(args.name) @@ -738,17 +738,17 @@ void do_speed(std::vector sequence_files, range_arguments else { if (args.hybrid & (args.order == 2)) - speed(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::hybridstrobe(args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.hybrid & (args.order == 3)) - speed(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max),create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::hybridstrobe(true, args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.minstrobers & (args.order == 2)) speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::minstrobe(args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.minstrobers & (args.order == 3)) speed(sequence_files, 
seqan3::views::kmer_hash(args.shape) | seqan3::views::minstrobe(true, args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.rand & (args.order == 2)) - speed(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::randstrobe(args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); else if (args.rand & (args.order == 3)) - speed(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::randstrobe(true, args.w_min + args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); } } } diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index d85c5ce..f277b21 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -25,15 +25,29 @@ rule plot: [shape + "_kmer_hash_30_speed.out" for shape in ["0", "805287931", "1004529051"]], [shape + "_kmer_hash_32_speed.out" for shape in ["0", "3169577727", "241004285"]], # 4 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]], + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" 
for k in range(8,17)], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [9,12,15]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]] + ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]] + # 4 "gaps" + ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]], + # 8 "gaps" + ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]] shell: "python3 plot_speed.py" rule 
download_example_Data: @@ -54,17 +68,17 @@ rule speed_minstrobemer: input: "../results/simulated_10000.fa.gz" output: - "minstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" + "minstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" shell: - "minions speed --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input}" + "minions speed --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" rule speed_hybridstrobemer: input: "../results/simulated_10000.fa.gz" output: - "hybridstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" + "hybridstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" shell: - "minions speed --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input}" + "minions speed --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" rule speed_randstrobemer: input: @@ -72,4 +86,28 @@ rule speed_randstrobemer: output: "randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" shell: - "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order {wildcards.order} {input}" + "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" + +rule speed_minstrobemer_original: + input: + "../results/simulated_10000.fa.gz" + output: + "Original_minstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" + shell: + "minions speed --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input} -o Original_" + +rule speed_hybridstrobemer_original: + input: + "../results/simulated_10000.fa.gz" + output: + 
"Original_hybridstrobemers_{kmer_size}_2_{wmin}_{wmax}_speed.out" + shell: + "minions speed --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order 2 {input} -o Original_" + +rule speed_randstrobemer_original: + input: + "../results/simulated_10000.fa.gz" + output: + "Original_randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" + shell: + "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order {wildcards.order} {input} -o Original_" diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 1fd6057..3d83bd7 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -22,14 +22,14 @@ def read_file(results, files): gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) shapes8 = ["51755","975475","13954519","241004285","241004285"] gapped8_kmers = read_file([], [shapes8[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) -hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17,2)]) -hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in range(8,17,2)]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in 
range(8,17,2)]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) +randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers8 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers8 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]]) # Plot comparison between k-mers fig = plt.figure() @@ -137,14 +137,14 @@ def read_file(results, files): pos_order3 = [1.25,4.25,7.25] strobe_range = [k for k in range(8,17)] -minstrobemers = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in strobe_range]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in strobe_range]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(12+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) 
-hybridstrobemers8 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in strobe_range]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(16+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) +randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]]) +minstrobemers8 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers8 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]]) fig = plt.figure() X = np.arange(len(k_size)) diff --git a/test/api/hybridstrobe_test.cpp b/test/api/hybridstrobe_test.cpp index 29b3ea1..3c7b9a7 100644 --- a/test/api/hybridstrobe_test.cpp +++ b/test/api/hybridstrobe_test.cpp @@ -19,12 +19,13 @@ using seqan3::operator""_dna4; using seqan3::operator""_shape; -using result_t = std::vector>; +using result_t = std::vector; inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); -inline static 
constexpr auto hybridstrobe_view = seqan3::views::hybridstrobe(1,5); +inline static constexpr auto hybridstrobe_view = seqan3::views::hybridstrobe(1,5,4); +inline static constexpr auto hybridstrobe_view_gapped = seqan3::views::hybridstrobe(1,5,2); using iterator_type = std::ranges::iterator_t< decltype(std::declval() | kmer_view @@ -32,7 +33,7 @@ using iterator_type = std::ranges::iterator_t< decltype(std::declval() | kmer_view),3> - {std::declval() | kmer_view, 1, 3})>; + {std::declval() | kmer_view, 1, 3, 4})>; template <> struct iterator_fixture : public ::testing::Test @@ -42,10 +43,10 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,134},{105,152},{166,27},{152,191},{97,111},{134,242}}; + result_t expected_range{6790, 27032, 42523, 39103, 24943, 34546}; - decltype(seqan3::views::hybridstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 5)) test_range = - seqan3::views::hybridstrobe(vec, 1, 5); + decltype(seqan3::views::hybridstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 5, 4)) test_range = + seqan3::views::hybridstrobe(vec, 1, 5, 4); }; template <> @@ -56,11 +57,11 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,152,27},{105,166,134},{166,97,111},{152,27,252},{97,27,252}}; + result_t expected_range{1742875, 6923910, 10903919, 9968636, 6364156}; decltype(seqan3::detail::hybridstrobe_view - (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3)) test_range = - seqan3::detail::hybridstrobe_view(vec, 1, 3); + (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3, 4)) test_range = + seqan3::detail::hybridstrobe_view(vec, 1, 3, 4); }; using test_types = ::testing::Types; @@ -81,7 +82,7 @@ class 
hybridstrobe_test : public ::testing::Test { protected: std::vector text1{"AAAAAAAAAAAAA"_dna4}; - result_t result1{{0,0},{0,0},{0,0},{0,0},{0,0}}; // Same result for ungapped and gapped + result_t result1{0,0,0,0,0}; // Same result for ungapped and gapped std::vector text3{"ACGGCGACGTTTAG"_dna4}; // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG @@ -93,14 +94,14 @@ class hybridstrobe_test : public ::testing::Test // stop at T gapped hybridstrobes: A--GC--C // start at A ungapped hybridstrobes: GCGAACGT, CGACACGT, GACGCGTT // start at A gapped hybridstrobes: G--AA--T, C--CA--T, G--GC--T - result_t result3_ungapped{{26,134},{105,152},{166,27},{152,191},{97,111},{134,242}}; - result_t result3_gapped{{2,10},{5,3},{10,3},{8,11},{5,12},{10,11}}; - result_t result3_ungapped_stop{{26,134}}; - result_t result3_gapped_stop{{2,10}}; - result_t result3_ungapped_start{{152,191},{97,111},{134,242}}; - result_t result3_gapped_start{{8,11},{5,12},{10,11}}; + result_t result3_ungapped{6790, 27032, 42523, 39103, 24943, 34546}; + result_t result3_gapped{42, 83, 163, 139, 92, 171}; + result_t result3_ungapped_stop{6790}; + result_t result3_gapped_stop{42}; + result_t result3_ungapped_start{39103, 24943, 34546}; + result_t result3_gapped_start{139, 92, 171}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped + result_t result3_1{0,0,0,0}; // Same result for ungapped and gapped // ACGGCGACGTTTAG // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG @@ -110,8 +111,8 @@ class hybridstrobe_test : public ::testing::Test // gapped hybridstrobes: A--GC--CA--T, C--CC--CA--T, G--GC--CA--T, G--AA--TC--T, C--CA--TG--T // start at A ungapped hybridstrobes: GGCGCGACACGT, GCGAACGTCGTT, CGACACGTCGTT // start at A gapped hybridstrobes: G--GC--CA--T, G--AA--TC--T, C--CA--TG--T - result_t order_3_ungapped{{26,152,27},{105,166,134},{166,97,111},{152,27,252},{97,27,252}}; - result_t 
order_3_gapped{{2,8,3},{5,5,7},{10,5,7},{8,3,12},{5,7,14}}; + result_t order_3_ungapped{1742875, 6923910, 10903919, 9968636, 6364156}; + result_t order_3_gapped{643, 1367, 2647, 2108, 1406}; }; template @@ -135,7 +136,7 @@ TYPED_TEST(hybridstrobe_view_properties_test, concepts) auto v = text | kmer_view | hybridstrobe_view; compare_types(v); - auto v2 = seqan3::detail::hybridstrobe_view(text | kmer_view,1,3); + auto v2 = seqan3::detail::hybridstrobe_view(text | kmer_view,1,3,4); compare_types(v2); } @@ -143,15 +144,15 @@ TYPED_TEST(hybridstrobe_view_properties_test, different_inputs_kmer_hash) { TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - result_t ungapped{{27,109},{109,97},{182,111},{216,97},{97,111},{134,242}}; - result_t gapped{{3,5},{5,3},{10,3},{12,5},{5,12},{10,11}}; + result_t ungapped{7021, 28001, 46703, 55393, 24943, 34546}; + result_t gapped{53, 83, 163, 197, 92, 171}; EXPECT_RANGE_EQ(ungapped, text | kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | hybridstrobe_view); + EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | hybridstrobe_view_gapped); - result_t ungapped3{{27,109,97},{109,216,27},{182,134,191},{216,97,111},{97,27,252}}; - result_t gapped3{{3,5,5},{5,5,7},{10,5,7},{12,5,7},{5,7,14}}; - EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::hybridstrobe_view(text | kmer_view,1,3))); - EXPECT_RANGE_EQ(gapped3, (seqan3::detail::hybridstrobe_view(text | gapped_kmer_view,1,3))); + result_t ungapped3{1797473, 7198747, 11962047, 14180719, 6364156}; + result_t gapped3{853, 1367, 2647, 3159, 1406}; + EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::hybridstrobe_view(text | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(gapped3, (seqan3::detail::hybridstrobe_view(text | gapped_kmer_view,1,3,2))); } TEST_F(hybridstrobe_test, ungapped_kmer_hash) @@ -159,24 +160,24 @@ TEST_F(hybridstrobe_test, ungapped_kmer_hash) 
EXPECT_RANGE_EQ(result1, text1 | kmer_view | hybridstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | kmer_view,1,3))); - EXPECT_RANGE_EQ(order_3_ungapped, (seqan3::detail::hybridstrobe_view(text3 | kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | kmer_view,1,3, 4))); + EXPECT_RANGE_EQ(order_3_ungapped, (seqan3::detail::hybridstrobe_view(text3 | kmer_view,1,3, 4))); } TEST_F(hybridstrobe_test, gapped_kmer_hash) { - EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | hybridstrobe_view); + EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | hybridstrobe_view_gapped); + EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | hybridstrobe_view_gapped); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | gapped_kmer_view,1,3))); - EXPECT_RANGE_EQ(order_3_gapped, (seqan3::detail::hybridstrobe_view(text3 | gapped_kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::hybridstrobe_view(text1 | gapped_kmer_view,1,3,2))); + EXPECT_RANGE_EQ(order_3_gapped, (seqan3::detail::hybridstrobe_view(text3 | gapped_kmer_view,1,3,2))); } TEST_F(hybridstrobe_test, combinability) { auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | hybridstrobe_view); - EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | hybridstrobe_view); + EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | hybridstrobe_view_gapped); // This test leads to a compile error, I believe because underlying range is not sized, as I am not planing to use take_while, I leave it as it is. 
#Todo /*auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); diff --git a/test/api/randstrobe_test.cpp b/test/api/randstrobe_test.cpp index 6175a4f..5a3dc29 100644 --- a/test/api/randstrobe_test.cpp +++ b/test/api/randstrobe_test.cpp @@ -19,12 +19,13 @@ using seqan3::operator""_dna4; using seqan3::operator""_shape; -using result_t = std::vector>; +using result_t = std::vector; inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); -inline static constexpr auto randstrobe_view = seqan3::views::randstrobe(2,4); +inline static constexpr auto randstrobe_view = seqan3::views::randstrobe(2,4,4); +inline static constexpr auto randstrobe_view_gapped = seqan3::views::randstrobe(2,4,2); using iterator_type = std::ranges::iterator_t< decltype(std::declval() | kmer_view @@ -32,7 +33,7 @@ using iterator_type = std::ranges::iterator_t< decltype(std::declval() | kmer_view),3> - {std::declval() | kmer_view, 1, 3})>; + {std::declval() | kmer_view, 1, 3, 4})>; template <> struct iterator_fixture : public ::testing::Test @@ -42,10 +43,10 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,97},{105,152},{166,111},{152,191},{97,252},{134,191}}; + result_t expected_range{6753, 27032, 42607, 39103, 25084, 34495}; - decltype(seqan3::views::randstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 2, 4)) test_range = - seqan3::views::randstrobe(vec, 2, 4); + decltype(seqan3::views::randstrobe(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 2, 4, 4)) test_range = + seqan3::views::randstrobe(vec, 2, 4, 4); }; template <> @@ -56,11 +57,11 @@ struct iterator_fixture : public ::testing::Test seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; 
decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{{26,166,134},{105,152,27},{166,97,27},{152,134,252},{97,27,252}}; + result_t expected_range{1746566, 6920219, 10903835, 9996028, 6364156}; decltype(seqan3::detail::randstrobe_view - (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3)) test_range = - seqan3::detail::randstrobe_view(vec, 1, 3); + (seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 1, 3, 4)) test_range = + seqan3::detail::randstrobe_view(vec, 1, 3, 4); }; using test_types = ::testing::Types; @@ -81,29 +82,29 @@ class randstrobe_test : public ::testing::Test { protected: std::vector text1{"AAAAAAAAAAAA"_dna4}; - result_t result1{{0,0},{0,0},{0,0},{0,0}}; // Same result for ungapped and gapped + result_t result1{0,0,0,0}; // Same result for ungapped and gapped std::vector text3{"ACGGCGACGTTTAG"_dna4}; // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG // ungapped Hashes: 26, 105, 166, 152, 97, 134, 27, 111, 191, 252, 242 // gapped Hashes: 2, 5, 10, 8, 5, 10, 3, 7, 11, 12, 14 - result_t result3_ungapped{{26,97},{105,152},{166,111},{152,191},{97,252},{134,191}}; - result_t result3_gapped{{2,5},{5,3},{10,7},{8,11},{5,12},{10,7}}; - result_t result3_ungapped_stop{{26,97}}; - result_t result3_gapped_stop{{2,5}}; - result_t result3_ungapped_start{{152,191},{97,252},{134,191}}; - result_t result3_gapped_start{{8,11},{5,12},{10,7}}; + result_t result3_ungapped{6753, 27032, 42607, 39103, 25084, 34495}; + result_t result3_gapped{37, 83, 167, 139, 92, 167}; + result_t result3_ungapped_stop{6753}; + result_t result3_gapped_stop{37}; + result_t result3_ungapped_start{39103, 25084, 34495}; + result_t result3_gapped_start{139, 92, 167}; - result_t result3_1{{0,0,0},{0,0,0},{0,0,0}}; // Same result for ungapped and gapped + result_t result3_1{0,0,0}; // Same result for ungapped and gapped // ACGGCGACGTTTAG // kmers: ACGG, CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, 
TTAG // ungapped Hashes: 26, 105, 166, 152, 97, 134, 27, 111, 191, 252, 242 // gapped Hashes: 2, 5, 10, 8, 5, 10, 3, 7, 11, 12, 14 - result_t result3_3_ungapped{{26,166,134},{105,152,27},{166,97,27},{152,134,252},{97,27,252}}; - result_t result3_3_gapped{{2,5,10},{5,5,7},{10,8,3},{8,10,7},{5,3,11}}; - result_t result3_3_ungapped_start{{152,134,252},{97,27,252}}; - result_t result3_3_gapped_start{{8,10,7},{5,3,11}}; + result_t result3_3_ungapped{1746566, 6920219, 10903835, 9996028, 6364156}; + result_t result3_3_gapped{602, 1367, 2691, 2215, 1339}; + result_t result3_3_ungapped_start{9996028, 6364156}; + result_t result3_3_gapped_start{2215, 1339}; }; template @@ -127,7 +128,7 @@ TYPED_TEST(randstrobe_view_properties_test, concepts) auto v = text | kmer_view | randstrobe_view; compare_types(v); - auto v2 = seqan3::detail::randstrobe_view(text | kmer_view,1,3); + auto v2 = seqan3::detail::randstrobe_view(text | kmer_view,1,3,4); compare_types(v2); } @@ -138,15 +139,15 @@ TYPED_TEST(randstrobe_view_properties_test, different_inputs_kmer_hash) // kmers: ACGT, CGTC, GTCG, TCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG // ungapped Hashes: 27, 109, 182, 216, 97, 134, 27, 111, 191, 252, 242 // gapped Hashes: 3, 5, 10, 12, 5, 10, 3, 7, 11, 12, 14 - result_t ungapped{{27,97},{109,216},{182,97},{216,111},{97,252},{134,191}}; - result_t gapped{{3,5},{5,12},{10,7},{12,7},{5,12},{10,7}}; + result_t ungapped{7009, 28120, 46689, 55407, 25084, 34495}; + result_t gapped{53, 92, 167, 199, 92, 167}; EXPECT_RANGE_EQ(ungapped, text | kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | randstrobe_view); + EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | randstrobe_view_gapped); - result_t ungapped3{{27,182,134},{109,216,27},{182,97,27},{216,134,252},{97,27,252}}; - result_t gapped3{{3,5,10},{5,12,3},{10,10,3},{12,5,7},{5,3,11}}; - EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::randstrobe_view(text | kmer_view,1,3))); - EXPECT_RANGE_EQ(gapped3, 
(seqan3::detail::randstrobe_view(text | gapped_kmer_view,1,3))); + result_t ungapped3{1816198, 7198747, 11952411, 14190332, 6364156}; + result_t gapped3{858, 1475, 2723, 3159, 1339}; + EXPECT_RANGE_EQ(ungapped3, (seqan3::detail::randstrobe_view(text | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(gapped3, (seqan3::detail::randstrobe_view(text | gapped_kmer_view,1,3,2))); } TEST_F(randstrobe_test, ungapped_kmer_hash) @@ -154,27 +155,27 @@ TEST_F(randstrobe_test, ungapped_kmer_hash) EXPECT_RANGE_EQ(result1, text1 | kmer_view | randstrobe_view); EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::randstrobe_view(text3 | kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(result3_3_ungapped, (seqan3::detail::randstrobe_view(text3 | kmer_view,1,3,4))); } TEST_F(randstrobe_test, gapped_kmer_hash) { - EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | randstrobe_view); + EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | randstrobe_view_gapped); + EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | randstrobe_view_gapped); - EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | gapped_kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::randstrobe_view(text3 | gapped_kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_1, (seqan3::detail::randstrobe_view(text1 | gapped_kmer_view,1,3,2))); + EXPECT_RANGE_EQ(result3_3_gapped, (seqan3::detail::randstrobe_view(text3 | gapped_kmer_view,1,3,2))); } TEST_F(randstrobe_test, combinability) { auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | kmer_view | randstrobe_view); - EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | 
randstrobe_view); + EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_kmer_view | randstrobe_view_gapped); - EXPECT_RANGE_EQ(result3_3_ungapped_start, (seqan3::detail::randstrobe_view(text3 | start_at_a | kmer_view,1,3))); - EXPECT_RANGE_EQ(result3_3_gapped_start, (seqan3::detail::randstrobe_view(text3 | start_at_a | gapped_kmer_view,1,3))); + EXPECT_RANGE_EQ(result3_3_ungapped_start, (seqan3::detail::randstrobe_view(text3 | start_at_a | kmer_view,1,3,4))); + EXPECT_RANGE_EQ(result3_3_gapped_start, (seqan3::detail::randstrobe_view(text3 | start_at_a | gapped_kmer_view,1,3,2))); /*auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); EXPECT_RANGE_EQ(result3_ungapped_stop, text3 | stop_at_t | kmer_view | randstrobe_view); From 4c2ee1d0cdf3c22e27d3937c216fa0628af6b374 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Wed, 25 Jan 2023 18:17:20 +0100 Subject: [PATCH 10/34] Add syncmer to speed. --- include/syncmer.hpp | 4 +-- include/syncmer_hash.hpp | 52 +++++++++++++++++++++++++++++++++ src/compare.cpp | 10 +++---- src/main.cpp | 7 +++-- test/cli/minions_speed_test.cpp | 4 +-- 5 files changed, 65 insertions(+), 12 deletions(-) diff --git a/include/syncmer.hpp b/include/syncmer.hpp index 039c588..efb4c94 100644 --- a/include/syncmer.hpp +++ b/include/syncmer.hpp @@ -510,12 +510,12 @@ class syncmer_view::basic_iterator if constexpr (second_range_is_given) window_values2.push_front(*urng3_iterator); - auto smallest_s_it = std::ranges::min_element(window_values, std::less{}); + auto smallest_s_it = std::ranges::min_element(window_values, std::less{}); syncmer_position_offset = std::distance(std::begin(window_values), smallest_s_it); if constexpr (second_range_is_given) { - smallest_s_it = std::ranges::min_element(window_values2, std::less{}); + smallest_s_it = std::ranges::min_element(window_values2, std::less{}); syncmer_position_offset2 = std::distance(std::begin(window_values2), smallest_s_it); if 
(*urng2_iterator < *urng4_iterator) diff --git a/include/syncmer_hash.hpp b/include/syncmer_hash.hpp index cabebbf..394daf5 100644 --- a/include/syncmer_hash.hpp +++ b/include/syncmer_hash.hpp @@ -109,6 +109,57 @@ struct syncmer_hash_fn } }; +struct syncmer_hash_no_reverse_fn +{ + /*!\brief Store the kmers and the smers and return a range adaptor closure object. + * \param[in] kmers The k-mer size to be used. + * \param[in] smers The s-mer size (s const pos) const + { + return seqan3::detail::adaptor_from_functor{*this, smers, kmers, pos}; + } + + /*!\brief Call the view's constructor with the underlying view, a k-mer size and a s-mer size as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and + * the reference type of the range must model seqan3::semialphabet. + * \param[in] kmers The k-mer size to be used. + * \param[in] smers The s-mer size (s + auto operator()(urng_t && urange, + size_t const smers, + size_t const kmers, + std::vector const pos) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::syncmer_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::syncmer_hash must model std::ranges::forward_range."); + static_assert(semialphabet>, + "The range parameter to views::syncmer_hash must be over elements of seqan3::semialphabet."); + + if (smers < 1 || kmers <= smers) + throw std::invalid_argument{"The chosen kmers and smers are not valid." 
+ "Please choose values greater than 1 and a smer size smaller than the kmer size."}; + + auto forward_strand = std::forward(urange) + | seqan3::views::kmer_hash(seqan3::shape(seqan3::ungapped(kmers))); + + auto forward_strand_smer = std::forward(urange) + | seqan3::views::kmer_hash(seqan3::shape(seqan3::ungapped(smers))); + + return seqan3::detail::syncmer_view + (forward_strand_smer, forward_strand, kmers - smers + 1, pos); + } +}; + } // namespace seqan3::detail /*!\name Alphabet related views @@ -156,4 +207,5 @@ struct syncmer_hash_fn * */ inline constexpr auto syncmer_hash = seqan3::detail::syncmer_hash_fn{}; +inline constexpr auto syncmer_hash_no_reverse = seqan3::detail::syncmer_hash_no_reverse_fn{}; //!\} diff --git a/src/compare.cpp b/src/compare.cpp index 44a6908..b36d67e 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -717,13 +717,11 @@ void do_speed(std::vector sequence_files, range_arguments { case kmer: speed(sequence_files, seqan3::views::kmer_hash(args.shape), create_name(args), args); break; - case minimiser: speed(sequence_files, seqan3::views::minimiser_hash(args.shape, - args.w_size, args.seed_se), create_name(args), args); + case minimiser: speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::minimiser(args.w_size.get()-args.shape.size()+1), create_name(args), args); break; - case modmers: speed(sequence_files, modmer_hash(args.shape, - args.w_size.get(), args.seed_se), create_name(args), args); + case modmers: speed(sequence_files, seqan3::views::kmer_hash(args.shape) | modmer(args.w_size.get()), create_name(args), args); break; - case strobemer: std::ranges::empty_view empty{}; + case strobemer: {std::ranges::empty_view empty{}; if (args.lib_implementation) { if (args.rand & (args.order == 2)) @@ -750,5 +748,7 @@ void do_speed(std::vector sequence_files, range_arguments else if (args.rand & (args.order == 3)) speed(sequence_files, seqan3::views::kmer_hash(args.shape) | seqan3::views::randstrobe(true, args.w_min + 
args.shape.size() - 1, args.w_max - args.shape.size() + 1, args.shape.count()), create_name(args), args); } + break;} + case syncmer: speed(sequence_files, syncmer_hash_no_reverse(args.w_size.get(), args.k_size, args.positions), create_name(args), args); } } diff --git a/src/main.cpp b/src/main.cpp index 28cd96c..585bd56 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,12 +43,12 @@ void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arg void read_range_arguments_syncmers(seqan3::argument_parser & parser, range_arguments & args) { parser.add_option(args.positions, 'p', "pos", "The positions that determine, if a submer is a syncmer."); - parser.add_option(args.t, 't', "t_vlue", "The offset for the position of the smallest sub-window."); } void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_arguments & args) { - parser.add_option(w_size, 'w', "window", "Define window size. Default: 60."); + parser.add_option(w_size, 'w', "window", "Define window size for minimiser. For syncmers, use this parameter for " + "the s-mer size, which should be smaller than the k-mer size in that case. Default: 60."); parser.add_option(shape, '\0', "shape", "Define a shape by the decimal of a bitvector, where 0 symbolizes a " "position to be ignored, 1 a position considered. 
Default: ungapped."); parser.add_option(se, '\0', "seed", "Define seed."); @@ -222,11 +222,12 @@ int speed(seqan3::argument_parser & parser) all_arguments(parser, args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", - seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); + seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer","syncmer"}); parser.add_flag(args.lib_implementation, '\0', "original", "Set, if you want to use the strobemer implementation from Sahlin."); read_range_arguments_minimiser(parser, args); read_range_arguments_strobemers(parser, args); + read_range_arguments_syncmers(parser, args); try { diff --git a/test/cli/minions_speed_test.cpp b/test/cli/minions_speed_test.cpp index cc94aed..526c486 100644 --- a/test/cli/minions_speed_test.cpp +++ b/test/cli/minions_speed_test.cpp @@ -24,7 +24,7 @@ TEST_F(cli_test, with_argument) TEST_F(cli_test, minimiser) { - cli_test_result result = execute_app("minions speed --method minimiser -k 19 -w 19 ", data("example1.fasta")); + cli_test_result result = execute_app("minions speed --method minimiser -k 19 -w 23 ", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{}); @@ -68,7 +68,7 @@ TEST_F(cli_test, wrong_method) std::string expected { "Error. Incorrect command line input for speed. Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer].\n" + "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer,syncmer].\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); From 1a4d2a8897cbb1f806ec2ec3ebe9e270b2fb1580 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 26 Jan 2023 14:15:44 +0100 Subject: [PATCH 11/34] Correct shapes. 
--- src/snakemake/speed/Snakefile | 6 +++--- src/snakemake/speed/plot_speed.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index f277b21..c165ea0 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -20,10 +20,10 @@ rule plot: [shape + "_kmer_hash_20_speed.out" for shape in ["0", "933855", "975475"]], [shape + "_kmer_hash_22_speed.out" for shape in ["0", "4192891", "3669089"]], [shape + "_kmer_hash_24_speed.out" for shape in ["0", "14548847", "13954519"]], - [shape + "_kmer_hash_26_speed.out" for shape in ["0", "14351359", "16503191"]], - [shape + "_kmer_hash_28_speed.out" for shape in ["0", "3758077695", "241004285"]], + [shape + "_kmer_hash_26_speed.out" for shape in ["0", "62257151", "66560815"]], + [shape + "_kmer_hash_28_speed.out" for shape in ["0", "234879855", "241004285"]], [shape + "_kmer_hash_30_speed.out" for shape in ["0", "805287931", "1004529051"]], - [shape + "_kmer_hash_32_speed.out" for shape in ["0", "3169577727", "241004285"]], + [shape + "_kmer_hash_32_speed.out" for shape in ["0", "3169577727", "3856068575"]], # 4 "gaps" ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 3d83bd7..52eb2d9 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -18,9 +18,9 @@ def read_file(results, files): # Read all files kmers = read_file([], ["0_kmer_hash_"+str(k)+"_speed.out" for k in k_size]) -shapes4 = ["36607","933855","14548847","3758077695","3169577727"] +shapes4 = ["36607","933855","14548847","234879855","3169577727"] gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -shapes8 = ["51755","975475","13954519","241004285","241004285"] 
+shapes8 = ["51755","975475","13954519","241004285","3856068575"] gapped8_kmers = read_file([], [shapes8[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) minstrobemers = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) hybridstrobemers = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) From 06491e2353b25eab53cc20aa230dbce9de3b28b1 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 26 Jan 2023 14:21:43 +0100 Subject: [PATCH 12/34] Save output files in results. --- src/snakemake/speed/Snakefile | 24 ++++++++++++------------ src/snakemake/speed/plot_speed.py | 12 ++++++------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index c165ea0..4313d13 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -1,19 +1,19 @@ rule all: - input: "Speed_all.png", - "Speed_all8.png", - "Speed_kmers.png", - "Speed_strobemers.png", - "Speed_strobemers4.png", - "Speed_strobemers8.png" + input: "../results/Speed_all.png", + "../results/Speed_all8.png", + "../results/Speed_kmers.png", + "../results/Speed_strobemers.png", + "../results/Speed_strobemers4.png", + "../results/Speed_strobemers8.png" rule plot: output: - "Speed_all.png", - "Speed_all8.png", - "Speed_kmers.png", - "Speed_strobemers.png", - "Speed_strobemers4.png", - "Speed_strobemers8.png" + "../results/Speed_all.png", + "../results/Speed_all8.png", + "../results/Speed_kmers.png", + "../results/Speed_strobemers.png", + "../results/Speed_strobemers4.png", + "../results/Speed_strobemers8.png" input: [shape + "_kmer_hash_16_speed.out" for shape in ["0", "36607", "51755"]], [shape + "_kmer_hash_18_speed.out" for shape in ["0","233469", "246365"]], diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 52eb2d9..ece6010 100644 --- 
a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -50,7 +50,7 @@ def read_file(results, files): plt.fill_between(pos, [x[0]-x[1] for x in gapped8_kmers], [x[0]+x[1] for x in gapped8_kmers], color = colors_error[2], alpha=0.7) plt.legend(title="Number of gaps") -plt.savefig("Speed_kmers.png") +plt.savefig("../results/Speed_kmers.png") # Plot comparison between strobemers 4 gaps fig = plt.figure() @@ -70,7 +70,7 @@ def read_file(results, files): plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) #plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") -plt.savefig("Speed_strobemers4.png") +plt.savefig("../results/Speed_strobemers4.png") # Plot comparison between strobemers 8 gaps fig = plt.figure() @@ -90,7 +90,7 @@ def read_file(results, files): plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[2], alpha=0.7) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("Speed_strobemers8.png", bbox_inches='tight') +plt.savefig("../results/Speed_strobemers8.png", bbox_inches='tight') # Plot comparison between all fig = plt.figure() @@ -109,7 +109,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("Speed_all.png",bbox_inches='tight') +plt.savefig("../results/Speed_all.png",bbox_inches='tight') # Plot comparison between all with 8 fig = plt.figure() @@ -128,7 +128,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("Speed_all8.png",bbox_inches='tight') +plt.savefig("../results/Speed_all8.png",bbox_inches='tight') # Plot comparison between strobemers all gaps @@ -173,4 
+173,4 @@ def read_file(results, files): plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("Speed_strobemers.png", bbox_inches='tight') +plt.savefig("../results/Speed_strobemers.png", bbox_inches='tight') From f9c0caf35baec43e52bb68987f42c232af0a102c Mon Sep 17 00:00:00 2001 From: mitradarja Date: Thu, 26 Jan 2023 15:39:51 +0100 Subject: [PATCH 13/34] Fix typos. --- src/snakemake/speed/Snakefile | 2 +- src/snakemake/speed/plot_speed.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index 4313d13..b551357 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -37,7 +37,7 @@ rule plot: ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], ["hybridstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]] + ["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]], # 4 "gaps" ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index ece6010..b945796 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -6,6 +6,7 @@ k_size = [16,20,24,28,32] pos = [x+0.25 for x in range(len(k_size))] +strobe_range = [int(k/2) for k in k_size] def read_file(results, files): for file in files: From e76b26cfff51682f6a12352194093de65b061a99 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Wed, 1 
Feb 2023 16:21:48 +0100 Subject: [PATCH 14/34] Speed for reprentative methods, some doc. --- .gitignore | 1 + README.md | 33 +++- src/compare.cpp | 2 +- src/snakemake/speed/README.md | 2 +- src/snakemake/speed/Snakefile | 54 +++++- src/snakemake/speed/plot_speed.py | 287 +++++++++++++++++++++++++----- 6 files changed, 320 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index d7a2930..641d323 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ src/snakemake/ !src/snakemake/speed/README !src/snakemake/speed/Snakefile !src/snakemake/speed/plot_speed.py +!src/snakemake/speed/plot_speed_representative.py diff --git a/README.md b/README.md index c048636..b763bca 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Currently, the following methods are supported: - strobemers (integrated as submodule from [here](https://github.com/ksahlin/strobemers) and implemented as a view) - syncmers -See Issue #1 for a list of methods that will be added in the future. +See Issue #1 for a list of methods that will be added in the future (see down below here for an example usage of each method). The following evaluation metrics are implemented at the moment (see an example usage for each metric down below): @@ -34,7 +34,7 @@ make test # Speed -Speeds creates a file called `{method}_speed.out` and returns the speed of processing a singular sequence in microseconds. As typical one sequence file contains multiple sequences the minimum speed, the mean, the variance and the maximum speed are returned. Speed can also handle multiple files and calculate the mean over all sequences found in all files. +Speeds creates a file called `{method}_speed.out` and returns the speed of processing a singular sequence in microseconds. As typical one sequence file contains multiple sequences the minimum speed, the mean, the variance and the maximum speed are returned. Speed can also handle multiple files and calculate the mean over all sequences found in all files. 
For all supported methods, Speed considers the non-canonical version. Example usage for calculating the k-mers of a given input file `in.fa`: ``` @@ -49,10 +49,35 @@ kmer_hash_16 10 11.478 0.970317 21 -1590685541 The first number is the minimum, then follows the mean, the variance and the maximum. The last number in the row can be ignored as it's only used for internal purposes. **Note:** -Currently, there are two implementations of the strobemers supported. The original one from [Kristoffer Sahlin](https://github.com/ksahlin/strobemers) and the one here presented. The one here presented is more comparable to the other methods used here, because they are based on the same hash functions. Therefore, these strobemers are used for almost every evaluation metric. However, currently the implementation is slower than the one from Sahlin, that is why both implementations can be used with speed. +Currently, speed supports two implementations of the strobemers. The original one from [Kristoffer Sahlin](https://github.com/ksahlin/strobemers) and the one here presented. The one here presented is more comparable to the other methods used here, because they are based on the same hash functions. Therefore, these strobemers are used for every other evaluation metric. For the original implementation, add the flag `--original` and note that for the original implementation, only randstrobemers are supported for order 2 and 3, minstrobemers and hybridstrobemers only support order 2. Furthermore, the flags `--w-min` and `--w-max` have different meanings between the original implementation and the implementation here. `w-min` in the implementation from minions is the distance between the first strobe to second strobe. While for the original implementation, it is the starting position in the sequence of the window that is considered for the second strobe. Therefore, the call with original should always add (k+1) to `w-min` compared to the minion implementation. 
-`w-max` in the implementation from minions is the window length that should be considered for every strobe besides the first one. All strobes need to be completely inside this window length to be considered. While for the original implementation, it is the position in the sequence until which a strobe that is considered has to end. Therefore, for a strobemer with a strobe length of 8, `w-min` of 0 and `w-max` of 15 in the minion implementation would equal a `w-min` of 9 and `w-max` of 24. For more details, please read the documentation for both implementations. +`w-max` in the implementation from minions is the window length that should be considered for every strobe besides the first one. All strobes need to be completely inside this window length to be considered. While for the original implementation, it is the position in the sequence until which a strobe that is considered has to start. Therefore, for a strobemer with a strobe length of 8, `w-min` of 0 and `w-max` of 15 in the minion implementation would equal a `w-min` of 9 and `w-max` of 17. For more details, please read the documentation for both implementations. + +# Methods + +If a metric supports a method, pick it with the flag `--method`. + +## k-mers + +k-mers are defined by their value of k, which can be defined with `-k`. A gapped k-mer can be used by defining a shape with `--shape`. Shape expects a number, this number should be the decimal representation of a binary number with a starting and an ending 1, each 0 in the binary number will be considered a gap. + +## strobemers + +Currently, minstrobemers with the flag `--min`, hybridstrobemers with the flag `--hybrid` and randstrobemers with the flag `--rand` are supported for order 2 and 3, which can be defined with `--order`. +With `-k` the length of a single strobe can be defined and with `--w-min` the distance between the first strobe to the next one and with `--w-max` the length of the window to pick the second and third strobe. 
+ +## minimizers + +Minimizers support ungapped and gapped k-mers. A window size can be given with `-w`. The randomization of the order is achieved by XORing all k-mer hash values with a seed; if the lexicographical order is wanted, `--seed` should be set to 0. For more information, see the [seqan tutorial](http://docs.seqan.de/seqan/3-master-user/tutorial_minimiser.html). + +## modmers + +Modmers support ungapped and gapped k-mers. The mod value can be given with `-w`. The randomization of the order is achieved by XORing all k-mer hash values with a seed; if the lexicographical order is wanted, `--seed` should be set to 0. + +## syncmers + +Syncmers support only ungapped k-mers. The s-mer value can be given with `-w`. The positions of an s-mer that make a k-mer a syncmer can be given with `-p`. The randomization of the order is achieved by XORing all k-mer hash values with a seed; if the lexicographical order is wanted, `--seed` should be set to 0. diff --git a/src/compare.cpp b/src/compare.cpp index b36d67e..81b3548 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -596,7 +596,7 @@ std::string create_name(range_arguments & args) return ""; } } - case syncmer: return "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(*args.positions.begin()) + "_" + std::to_string(*args.positions.end()); + case syncmer: return "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); default: return ""; diff --git a/src/snakemake/speed/README.md b/src/snakemake/speed/README.md index f560307..34620ed 100644 --- a/src/snakemake/speed/README.md +++ b/src/snakemake/speed/README.md @@ -1,6 +1,6 @@ # Simulated Data -The file "../results/simulated_10000.fa.gz" contains 1,000 random sequences of length 10,000 bp and was created via {mason](https://www.seqan.de/apps/mason.html) with the following command: +The file
"../results/simulated_10000.fa.gz" contains 1,000 random sequences of length 10,000 bp and was created via [mason](https://www.seqan.de/apps/mason.html) with the following command: ``` mason_genome -o simulated_10000.fa $(for i in {1..100}; do echo -l 10000; done;) -s 42 diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index b551357..1aa8cd9 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -4,7 +4,10 @@ rule all: "../results/Speed_kmers.png", "../results/Speed_strobemers.png", "../results/Speed_strobemers4.png", - "../results/Speed_strobemers8.png" + "../results/Speed_strobemers8.png", + "../results/Speed_randstrobemers_original.png", + "../results/Speed_randstrobemers_original_order3.png", + "../results/Speed_representative.png" rule plot: output: @@ -13,7 +16,9 @@ rule plot: "../results/Speed_kmers.png", "../results/Speed_strobemers.png", "../results/Speed_strobemers4.png", - "../results/Speed_strobemers8.png" + "../results/Speed_strobemers8.png", + "../results/Speed_randstrobemers_original.png", + "../results/Speed_randstrobemers_original_order3.png" input: [shape + "_kmer_hash_16_speed.out" for shape in ["0", "36607", "51755"]], [shape + "_kmer_hash_18_speed.out" for shape in ["0","233469", "246365"]], @@ -32,12 +37,12 @@ rule plot: ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [9,12,15]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - 
["randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]], + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]], # 4 "gaps" ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], @@ -50,6 +55,23 @@ rule plot: ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]] shell: "python3 plot_speed.py" +rule plot_representative: + output: + "../results/Speed_representative.png" + input: + # minimiser + ["0_minimiser_hash_20_"+str(w)+"_speed.out" for w in [i for i in range(24,44,4)]], + ["0_minimiser_hash_"+str(k)+"_40_speed.out" for k in [i for i in range(16,36,4)]], + # modmer + ["0_modmer_hash_20_"+str(w)+"_speed.out" for w in [3,5,7,9,11]], + ["0_modmer_hash_"+str(k)+"_7_speed.out" for k in [i for i in range(16,36,4)]], + # syncmer + ["syncmer_hash_20_"+str(w)+"_0_0_speed.out" for w in [18,16,14,12,10]], + ["syncmer_hash_"+str(k)+"_10_0_0_speed.out" for k in [i for i in range(22,12,-2)]], + ["syncmer_hash_20_"+str(w)+"_0_6_speed.out" for w in [15,11,7,3,1]], + ["syncmer_hash_"+str(k)+"_3_0_6_speed.out" for k in [i for i in range(28,8,-4)]] + shell: "python3 plot_speed_representative.py" + rule download_example_Data: output: "../results/simulated_10000.fa.gz" @@ -111,3 +133,19 @@ rule speed_randstrobemer_original: "Original_randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_speed.out" 
shell: "minions speed --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --original --order {wildcards.order} {input} -o Original_" + +rule speed_minimiser_modmer: + input: + "../results/simulated_10000.fa.gz" + output: + "0_{method}_hash_{kmer_size}_{w_size}_speed.out" + shell: + "minions speed --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape 0 -o 0_ {input}" + +rule speed_syncmer: + input: + "../results/simulated_10000.fa.gz" + output: + "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_speed.out" + shell: + "minions speed --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input}" diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index b945796..d6260a5 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -4,9 +4,15 @@ import matplotlib.pyplot as plt import numpy as np -k_size = [16,20,24,28,32] +#k_size = [16,20,24,28,32] +#pos = [x+0.25 for x in range(len(k_size))] +#strobe_range = [int(k/2) for k in k_size] +k_size = [16,18,20,22,24,26,28,30,32] pos = [x+0.25 for x in range(len(k_size))] -strobe_range = [int(k/2) for k in k_size] +pos_order3 = [1.25,4.25,7.25] +k_order3 = [9,12,15] +k_size_order3 = [i*2 for i in k_order3] +strobe_range = [k for k in range(8,17)] def read_file(results, files): for file in files: @@ -19,18 +25,29 @@ def read_file(results, files): # Read all files kmers = read_file([], ["0_kmer_hash_"+str(k)+"_speed.out" for k in k_size]) -shapes4 = ["36607","933855","14548847","234879855","3169577727"] +shapes4 = ["36607","233469","933855","4192891","14548847","62257151","234879855","805287931","3169577727"] gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -shapes8 = ["51755","975475","13954519","241004285","3856068575"] +shapes8 = 
["51755","246365","975475","3669089","13954519","66560815","241004285","1004529051","3856068575"] gapped8_kmers = read_file([], [shapes8[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -minstrobemers = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers8 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]]) + +kmers_order3 = read_file([], ["0_kmer_hash_"+str(k)+"_speed.out" for k in k_size_order3]) +shapes4_order3 = ["233469","14548847","805287931"] +gapped4_order3 = read_file([], [shapes4_order3[i] + "_kmer_hash_"+str(k_size_order3[i])+"_speed.out" for i in range(len(k_order3))]) +shapes8_order3 = ["246365","13954519","1004529051"] +gapped8_order3 = read_file([], [shapes8_order3[i] + "_kmer_hash_"+str(k_size_order3[i])+"_speed.out" for i in range(len(k_order3))]) + +minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in strobe_range]) +minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in k_order3]) +hybridstrobemers2 = 
read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" for k in k_order3]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in strobe_range]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in k_order3]) +minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) +minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) +hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) +randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) # Plot comparison between k-mers fig = plt.figure() @@ -62,12 +79,12 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in minstrobemers], color = colors[0], label='minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers], color = colors[1], label='hybridstrobemers') +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='minstrobemers') +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='hybridstrobemers') plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='randstrobemers') -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers], [x[0]+x[1] for x in minstrobemers], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x 
in hybridstrobemers], [x[0]+x[1] for x in hybridstrobemers], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) #plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") @@ -82,17 +99,57 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in minstrobemers8], color = colors[0], label='minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers8], color = colors[1], label='hybridstrobemers') +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[0], label='minstrobemers') +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[1], label='hybridstrobemers') plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[2], label='randstrobemers') -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers8], [x[0]+x[1] for x in minstrobemers8], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers8], [x[0]+x[1] for x in hybridstrobemers8], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[1], alpha=0.7) plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[2], alpha=0.7) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_strobemers8.png", bbox_inches='tight') +# Plot comparison between strobemers 
4 gaps order 3 +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='minstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[1], label='hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='randstrobemers') + +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) + +#plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers4_order3.png") + +# Plot comparison between strobemers 8 gaps order 3 +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[0], label='minstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[1], label='hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[2], label='randstrobemers') + +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = colors_error[1], alpha=0.7) 
+plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[2], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers8_order3.png", bbox_inches='tight') + # Plot comparison between all fig = plt.figure() X = np.arange(len(k_size)) @@ -105,8 +162,8 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) -plt.plot(pos, [x[0] for x in minstrobemers], color = colors[3], label='minstrobemers',linewidth=3.0) -plt.plot(pos, [x[0] for x in hybridstrobemers], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") @@ -124,29 +181,52 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) -plt.plot(pos, [x[0] for x in minstrobemers8], color = colors[3], label='minstrobemers',linewidth=3.0) -plt.plot(pos, [x[0] for x in hybridstrobemers8], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], 
label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all8.png",bbox_inches='tight') +# Plot comparison between all order 3 +fig = plt.figure() +X = np.arange(len(k_size_order3)) -# Plot comparison between strobemers all gaps -k_size = [16,18,20,22,24,26,28,30,32] -pos = [x+0.25 for x in range(len(k_size))] -pos_order3 = [1.25,4.25,7.25] -strobe_range = [k for k in range(8,17)] +colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +plt.xlabel("k") +plt.xticks(pos_order3, k_size_order3) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) + +#plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_all_order3.png",bbox_inches='tight') + +# Plot comparison between all with 8 order 3 +fig = plt.figure() +X = np.arange(len(k_size_order3)) + +colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +plt.xlabel("k") +plt.xticks(pos_order3, k_size_order3) +plt.ylabel("Speed in microseconds") # in microseconds -minstrobemers = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers = 
read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]]) -minstrobemers8 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers8 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]]) +plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_all8_order3.png",bbox_inches='tight') + +# Plot comparison between strobemers all gaps fig = plt.figure() X = np.arange(len(k_size)) colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] @@ -155,23 +235,140 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, 
[x[0] for x in minstrobemers], color = colors[0], label='4 minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers], color = colors[1], label='4 hybridstrobemers') +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='4 minstrobemers') +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='4 hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[1], label='4 randstrobemers3',linestyle="dashed") plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='4 randstrobemers') plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in minstrobemers8], color = colors[3], label='8 minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers8], color = colors[4], label='8 hybridstrobemers') +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[3], label='8 minstrobemers') +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='8 hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='4 randstrobemers3',linestyle="dashed") plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='8 randstrobemers') plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='8 randstrobemers3',linestyle="dashed") -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers], [x[0]+x[1] for x in minstrobemers], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers], [x[0]+x[1] for x in hybridstrobemers], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in 
minstrobemers2], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers8], [x[0]+x[1] for x in minstrobemers8], color = colors_error[3], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers8], [x[0]+x[1] for x in hybridstrobemers8], color = colors_error[4], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[3], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[3], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[4], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = colors_error[4], alpha=0.7) plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_strobemers.png", bbox_inches='tight') + + 
+# Plot comparison between strobemers all gaps +original_minstrobemers2 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +original_hybridstrobemers2 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) +original_minstrobemers28 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +original_hybridstrobemers28 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) + +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] +colors_error = ["#748beb","#e47585","#b261c2","#01d63a","#00e7e0","#fefea1"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors[0], label='4 minstrobemers') +plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors[1], label='4 hybridstrobemers') +plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors[2], label='4 randstrobemers') +plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors[3], label='8 minstrobemers') +plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors[4], label='8 hybridstrobemers') 
+plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors[5], label='8 randstrobemers') + +plt.fill_between(pos, [x[0]-x[1] for x in original_minstrobemers2], [x[0]+x[1] for x in original_minstrobemers2], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in original_hybridstrobemers2], [x[0]+x[1] for x in original_hybridstrobemers2], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in original_randstrobemers2], [x[0]+x[1] for x in original_randstrobemers2], color = colors_error[2], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in original_minstrobemers28], [x[0]+x[1] for x in original_minstrobemers28], color = colors_error[3], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in original_hybridstrobemers28], [x[0]+x[1] for x in original_hybridstrobemers28], color = colors_error[4], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in original_randstrobemers28], [x[0]+x[1] for x in original_randstrobemers28], color = colors_error[5], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers_original_all.png", bbox_inches='tight') + + +fig = plt.figure() +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[2], label='4') +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[5], label='8') +plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors[0], label='4 ori') +plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors[1], label='8 ori') + +#plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[5], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_minstrobemers2], [x[0]+x[1] for x in 
original_minstrobemers2], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_minstrobemers28], [x[0]+x[1] for x in original_minstrobemers28], color = colors_error[1], alpha=0.7) + +plt.legend(title="Methods") +plt.savefig("../results/Speed_minstrobemers_original.png", bbox_inches='tight') + +fig = plt.figure() +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[2], label='4') +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[5], label='8') +plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors[0], label='4 ori') +plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors[1], label='8 ori') + +#plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[5], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_hybridstrobemers2], [x[0]+x[1] for x in original_hybridstrobemers2], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_hybridstrobemers28], [x[0]+x[1] for x in original_hybridstrobemers28], color = colors_error[1], alpha=0.7) + +plt.legend(title="Methods") +plt.savefig("../results/Speed_hybridstrobemers_original.png", bbox_inches='tight') + +fig = plt.figure() +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='4') +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='8') +plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors[0], label='4 ori') +plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors[1], label='8 ori') + 
+#plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_randstrobemers2], [x[0]+x[1] for x in original_randstrobemers2], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in original_randstrobemers28], [x[0]+x[1] for x in original_randstrobemers28], color = colors_error[1], alpha=0.7) + +plt.legend(title="Methods") +plt.savefig("../results/Speed_randstrobemers_original.png", bbox_inches='tight') + +fig = plt.figure() +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='4') +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='8') +plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors[0], label='4 ori') +plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors[1], label='8 ori') + +#plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) +#plt.fill_between(pos_order3, [x[0]-x[1] for x in original_randstrobemers3], [x[0]+x[1] for x in original_randstrobemers3], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos_order3, [x[0]-x[1] for x in original_randstrobemers38], [x[0]+x[1] for x in original_randstrobemers38], color = colors_error[1], alpha=0.7) + +plt.legend(title="Methods") +plt.savefig("../results/Speed_randstrobemers_original_order3.png", bbox_inches='tight') From ba3fbc6a99822364a4f3a294e2db8456351b58d7 Mon Sep 17 00:00:00 2001 
From: mitradarja Date: Fri, 3 Feb 2023 16:33:32 +0100 Subject: [PATCH 15/34] Count enables representative methods for strobemers and some clean up. --- include/compare.h | 6 +- include/modmer.hpp | 2 - include/modmer_hash.hpp | 2 - lib/seqan3 | 2 +- src/compare.cpp | 150 +++++++++++++++------- src/main.cpp | 5 +- src/snakemake/genmap/genmap_uniqueness.py | 25 ++-- src/snakemake/speed/Snakefile | 32 ++--- src/snakemake/speed/plot_speed.py | 139 -------------------- 9 files changed, 149 insertions(+), 214 deletions(-) diff --git a/include/compare.h b/include/compare.h index bde205f..dbb1ccc 100644 --- a/include/compare.h +++ b/include/compare.h @@ -108,8 +108,9 @@ void store_ibf(IBFType const & ibf, /*! \brief Function that creates the string name of the used view. * \param args The arguments about the view to be used. + * \param args If true, "Strobmer" is added to the name. */ -std::string create_name(range_arguments & args); +std::string create_name(range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, comparing the methods in regard of their coverage. * \param args The arguments about the view to be used. @@ -119,8 +120,9 @@ void do_accuracy(accuracy_arguments & args); /*! \brief Function, comparing the number of submers. * \param sequence_files A vector of sequence files. * \param args The arguments about the view to be used. + * \param underlying_strobemer True, if strobemers should be used with a representative method like minimizer. */ -void do_counts(std::vector sequence_files, range_arguments & args); +void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, comparing the methods in regard of their distance. * \param sequence_file A sequence file. 
diff --git a/include/modmer.hpp b/include/modmer.hpp index b4cf1e6..1f2a56f 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -396,8 +396,6 @@ struct modmer_fn template constexpr auto operator()(urng1_t && urange1, size_t const mod_used) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer must model std::ranges::forward_range."); diff --git a/include/modmer_hash.hpp b/include/modmer_hash.hpp index 22f3f2a..1b34929 100644 --- a/include/modmer_hash.hpp +++ b/include/modmer_hash.hpp @@ -65,8 +65,6 @@ struct modmer_hash_fn uint32_t const mod_used, seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer_hash must model std::ranges::forward_range."); static_assert(semialphabet>, diff --git a/lib/seqan3 b/lib/seqan3 index d29786b..9f83975 160000 --- a/lib/seqan3 +++ b/lib/seqan3 @@ -1 +1 @@ -Subproject commit d29786b61de73f14eed5c83c14ef7e02f038bdb1 +Subproject commit 9f83975b4a5dd4e73007b8040e5ba83647e72090 diff --git a/src/compare.cpp b/src/compare.cpp index 81b3548..9d11352 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -191,7 +191,7 @@ void accuracy(urng_t input_view, * \param method_name Name of the tested method. * \param args The arguments about the view to be used, needed for strobemers. 
*/ -template +template void counts(std::vector sequence_files, urng_t input_view, std::string method_name, range_arguments & args) { std::vector counts_results{}; @@ -199,20 +199,20 @@ void counts(std::vector sequence_files, urng_t input_view for (int i = 0; i < sequence_files.size(); ++i) { robin_hood::unordered_node_map hash_table{}; - if constexpr (strobemers > 0) + seqan3::sequence_file_input> fin{sequence_files[i]}; + if constexpr (underlying_strobemer) // TODO: Fix this, so that syncmers for strobemers do not need extra code. { - seqan3::sequence_file_input> fin{sequence_files[i]}; for (auto & [seq] : fin) { - std::vector> strobes_vector; - get_strobemers(seq, args, strobes_vector); - for (auto & t : strobes_vector) // iterate over the strobemer tuples - hash_table[std::get<0>(t)] = std::min(65534u, hash_table[std::get<0>(t)] + 1); + for (auto && hash : seq | input_view) + { + if (syncmer_filter(hash,args.w_size.get(), (args.k_size *args.order), args.positions, args.seed_se.get())) + hash_table[hash] = std::min(65534u, hash_table[hash] + 1); + } } } else { - seqan3::sequence_file_input> fin{sequence_files[i]}; for (auto & [seq] : fin) { for (auto && hash : seq | input_view) @@ -555,15 +555,21 @@ void unique(std::vector input_files, std::filesystem::pat outfile.close(); } -std::string create_name(range_arguments & args) +std::string create_name(range_arguments & args, bool underlying_strobemer) { + std::string prefix{""}; + if (underlying_strobemer) + { + prefix = "Strobemer_"; + } + switch(args.name) { case kmer: return "kmer_hash_"+std::to_string(args.k_size); break; - case minimiser: return "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case minimiser: return prefix +"minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; - case modmers: return "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case modmers: return prefix 
+"modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; case strobemer: { std::ranges::empty_view empty{}; @@ -596,10 +602,9 @@ std::string create_name(range_arguments & args) return ""; } } - case syncmer: return "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); + case syncmer: return prefix + "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); default: return ""; - } } @@ -622,33 +627,92 @@ void do_accuracy(accuracy_arguments & args) } } -void do_counts(std::vector sequence_files, range_arguments & args) +void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer) { - switch(args.name) + if(underlying_strobemer) { - case kmer: counts(sequence_files, seqan3::views::kmer_hash(args.shape), create_name(args), args); - break; - case minimiser: counts(sequence_files, seqan3::views::minimiser_hash(args.shape, - args.w_size, args.seed_se), create_name(args), args); - break; - case modmers: counts(sequence_files, modmer_hash(args.shape, - args.w_size.get(), args.seed_se), create_name(args), args); - break; - case syncmer: counts(sequence_files, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), - create_name(args), args); - break; - case strobemer: { - std::ranges::empty_view empty{}; - if (args.rand & (args.order == 2)) - counts, 1>(sequence_files, empty, create_name(args), args); - else if (args.rand & (args.order == 3)) - counts, 2>(sequence_files, empty, create_name(args), args); - else if (args.hybrid) - counts, 3>(sequence_files, empty, create_name(args), args); - else if (args.minstrobers) - counts, 4>(sequence_files, empty, create_name(args), args); + seqan3::debug_stream << 
args.shape.size(); + switch(args.name) + { + case minimiser: { + if (args.hybrid & (args.order == 2)) + counts(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + if (args.hybrid & (args.order == 3)) + counts(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + if (args.minstrobers & (args.order == 2)) + counts(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + if (args.minstrobers & (args.order == 3)) + counts(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + if (args.rand & (args.order == 2)) + counts(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + if (args.rand & (args.order == 3)) + counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + + } break; - } + case modmers: { + if (args.hybrid & (args.order == 2)) + counts(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + if (args.hybrid & (args.order == 3)) + counts(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | 
modmer(args.w_size.get()), create_name(args, true), args); + if (args.minstrobers & (args.order == 2)) + counts(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + if (args.minstrobers & (args.order == 3)) + counts(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + if (args.rand & (args.order == 2)) + counts(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + if (args.rand & (args.order == 3)) + counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + + } + break; + case syncmer: { + if (args.hybrid & (args.order == 2)) + counts),true>(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + if (args.hybrid & (args.order == 3)) + counts),true>(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + if (args.minstrobers & (args.order == 2)) + counts),true>(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + if (args.minstrobers & (args.order == 3)) + counts),true>(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + if (args.rand & (args.order == 2)) + counts),true>(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + if (args.rand & (args.order == 3)) + counts),true>(sequence_files, 
randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + } + break; + } + } + else + { + switch(args.name) + { + case kmer: counts(sequence_files, seqan3::views::kmer_hash(args.shape), create_name(args), args); + break; + case minimiser: counts(sequence_files, seqan3::views::minimiser_hash(args.shape, + args.w_size, args.seed_se), create_name(args), args); + break; + case modmers: counts(sequence_files, modmer_hash(args.shape, + args.w_size.get(), args.seed_se), create_name(args), args); + break; + case syncmer: counts(sequence_files, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), + create_name(args), args); + break; + case strobemer: { + if (args.hybrid & (args.order == 2)) + counts(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.hybrid & (args.order == 3)) + counts(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.minstrobers & (args.order == 2)) + counts(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.minstrobers & (args.order == 3)) + counts(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 2)) + counts(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 3)) + counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + } + } } } @@ -695,17 +759,17 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen else { if (args.hybrid & (args.order == 2)) - match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); 
+ match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); else if (args.hybrid & (args.order == 3)) - match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max),create_name(args), args); + match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se),create_name(args), args); else if (args.minstrobers & (args.order == 2)) - match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); else if (args.minstrobers & (args.order == 3)) - match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); else if (args.rand & (args.order == 2)) - match(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + match(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); else if (args.rand & (args.order == 3)) - match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } } } diff --git a/src/main.cpp b/src/main.cpp index 585bd56..1f15cf3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -117,10 +117,13 @@ int counts(seqan3::argument_parser & parser) { range_arguments args{}; std::vector sequence_files{}; + bool underlying_strobemer = false; parser.info.short_description = "Counts the number of submers in the given " "sequence files."; 
parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); + parser.add_flag(underlying_strobemer,'\0', "strobemer", "If strobemers should be used as base for representative " + "methods like minimizers. Default: False."); all_arguments(parser, args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", @@ -142,7 +145,7 @@ int counts(seqan3::argument_parser & parser) } string_to_methods(method, args.name); - do_counts(sequence_files, args); + do_counts(sequence_files, args, underlying_strobemer); return 0; } diff --git a/src/snakemake/genmap/genmap_uniqueness.py b/src/snakemake/genmap/genmap_uniqueness.py index 767dfe1..44feb1d 100644 --- a/src/snakemake/genmap/genmap_uniqueness.py +++ b/src/snakemake/genmap/genmap_uniqueness.py @@ -1,3 +1,4 @@ +import os import sys import numpy as np @@ -7,12 +8,18 @@ k_mers = [16,20,24,28,32] errors = [0,1,2] -def get_unique(in_file): +def get_unique(in_file,k): frequencies = np.fromfile(in_file, dtype=np.uint16) - number_elements = len (frequencies) + number_elements = len(frequencies) # Problem counts all elements that appear more than once multiple times number_unique = (frequencies == 1).sum() - # print(number_elements, number_unique, (number_unique*100.0)/number_elements) + file = "../count/0_minimiser_hash_"+str(k)+"_"+str(k)+"_counts.out" + if (os.path.exists(file)): # This is more accurate + with open(file, 'r') as f: + for line in f: + number_elements = round(float(line.split('\t')[2]),2) + + print(k, number_unique, number_elements, len(frequencies),os.path.exists(file)) return (number_unique*100.0)/number_elements def get_results(species): @@ -21,22 +28,22 @@ def get_results(species): results.append([]) for k in k_mers: for e in errors: - genmap_file = 'genmap/'+species+'_K_'+str(k)+'_E_'+str(e)+'.freq16' - results[errors.index(e)].append(get_unique(genmap_file)) + genmap_file = '../../../../genmap/build/genmap/'+species+'_K_'+str(k)+'_E_'+str(e)+'.freq16' 
+ results[errors.index(e)].append(get_unique(genmap_file,k)) fig = plt.figure() X = np.arange(len(k_mers)) colors = ["#00ba32","#00d6e7","#fad100"] pos = [0.25,1.25,2.25,3.25,4.25] plt.xlabel("k") plt.xticks(pos, k_mers) - plt.ylabel("percentage of unique k-mers") + plt.ylabel("% of unique k-mers") plt.bar(X + 0.00, results[0], color = colors[0], width = 0.25, label='0') plt.bar(X + 0.25, results[1], color = colors[1], width = 0.25, label='1') plt.bar(X + 0.50, results[2], color = colors[2], width = 0.25, label='2') plt.legend(title="# of errors") - plt.savefig("Uniqueness_"+species+".png") + plt.savefig("../results/Uniqueness_"+species+".png") get_results("Human") -get_results("Mouse") -get_results("Wheat") +#get_results("Mouse") +#get_results("Wheat") diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index 1aa8cd9..23b0166 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -1,22 +1,24 @@ rule all: - input: "../results/Speed_all.png", - "../results/Speed_all8.png", - "../results/Speed_kmers.png", - "../results/Speed_strobemers.png", - "../results/Speed_strobemers4.png", - "../results/Speed_strobemers8.png", - "../results/Speed_randstrobemers_original.png", - "../results/Speed_randstrobemers_original_order3.png", - "../results/Speed_representative.png" + input: "../results/Speed_all.png", + "../results/Speed_all8.png", + "../results/Speed_all_order3.png", + "../results/Speed_all8_order3.png", + "../results/Speed_strobemers_original_all.png", + "../results/Speed_minstrobemers_original.png", + "../results/Speed_hybridstrobemers_original.png", + "../results/Speed_randstrobemers_original.png", + "../results/Speed_randstrobemers_original_order3.png" + "../results/Speed_representative.png" rule plot: output: "../results/Speed_all.png", "../results/Speed_all8.png", - "../results/Speed_kmers.png", - "../results/Speed_strobemers.png", - "../results/Speed_strobemers4.png", - "../results/Speed_strobemers8.png", + 
"../results/Speed_all_order3.png", + "../results/Speed_all8_order3.png", + "../results/Speed_strobemers_original_all.png", + "../results/Speed_minstrobemers_original.png", + "../results/Speed_hybridstrobemers_original.png", "../results/Speed_randstrobemers_original.png", "../results/Speed_randstrobemers_original_order3.png" input: @@ -140,7 +142,7 @@ rule speed_minimiser_modmer: output: "0_{method}_hash_{kmer_size}_{w_size}_speed.out" shell: - "minions speed --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape 0 -o 0_ {input}" + "minions speed --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} -o 0_ {input}" rule speed_syncmer: input: @@ -148,4 +150,4 @@ rule speed_syncmer: output: "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_speed.out" shell: - "minions speed --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input}" + "minions speed --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} {input}" diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index d6260a5..8081cab 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -49,107 +49,6 @@ def read_file(results, files): randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) -# Plot comparison between k-mers -fig = plt.figure() -X = np.arange(len(k_size)) -colors = ["#00ba32","#00d6e7","#fad100"] -colors_error = ["#01d63a","#00e7e0","#fefea1"] -plt.xlabel("k") -plt.xticks(pos, k_size) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='0') -plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], 
label='4') -plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8') - - -plt.fill_between(pos, [x[0]-x[1] for x in kmers], [x[0]+x[1] for x in kmers], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in gapped4_kmers], [x[0]+x[1] for x in gapped4_kmers], color = colors_error[1], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in gapped8_kmers], [x[0]+x[1] for x in gapped8_kmers], color = colors_error[2], alpha=0.7) - -plt.legend(title="Number of gaps") -plt.savefig("../results/Speed_kmers.png") - -# Plot comparison between strobemers 4 gaps -fig = plt.figure() -X = np.arange(len(k_size)) -colors = ["#697ed5","#c76674","#9350a1"] -colors_error = ["#748beb","#e47585","#b261c2"] -plt.xlabel("k") -plt.xticks(pos, k_size) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='hybridstrobemers') -plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='randstrobemers') - -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) - -#plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") -plt.savefig("../results/Speed_strobemers4.png") - -# Plot comparison between strobemers 8 gaps -fig = plt.figure() -X = np.arange(len(k_size)) -colors = ["#697ed5","#c76674","#9350a1"] -colors_error = ["#748beb","#e47585","#b261c2"] -plt.xlabel("k") -plt.xticks(pos, k_size) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[0], 
label='minstrobemers') -plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[1], label='hybridstrobemers') -plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[2], label='randstrobemers') - -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[1], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[2], alpha=0.7) - -plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("../results/Speed_strobemers8.png", bbox_inches='tight') - -# Plot comparison between strobemers 4 gaps order 3 -fig = plt.figure() -X = np.arange(len(k_size)) -colors = ["#697ed5","#c76674","#9350a1"] -colors_error = ["#748beb","#e47585","#b261c2"] -plt.xlabel("k") -plt.xticks(pos, k_size) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='minstrobemers') -plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[1], label='hybridstrobemers') -plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='randstrobemers') - -plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) - -#plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") -plt.savefig("../results/Speed_strobemers4_order3.png") - -# Plot comparison between strobemers 8 gaps order 3 -fig = plt.figure() -X = np.arange(len(k_size)) 
-colors = ["#697ed5","#c76674","#9350a1"] -colors_error = ["#748beb","#e47585","#b261c2"] -plt.xlabel("k") -plt.xticks(pos_order3, k_order3) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[0], label='minstrobemers') -plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[1], label='hybridstrobemers') -plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[2], label='randstrobemers') - -plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[0], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = colors_error[1], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[2], alpha=0.7) - -plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("../results/Speed_strobemers8_order3.png", bbox_inches='tight') - # Plot comparison between all fig = plt.figure() X = np.arange(len(k_size)) @@ -226,44 +125,6 @@ def read_file(results, files): plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all8_order3.png",bbox_inches='tight') -# Plot comparison between strobemers all gaps -fig = plt.figure() -X = np.arange(len(k_size)) -colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] -colors_error = ["#748beb","#e47585","#b261c2","#01d63a","#00e7e0","#fefea1"] -plt.xlabel("k") -plt.xticks(pos, k_size) -plt.ylabel("Speed in microseconds") # in microseconds - -plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='4 minstrobemers') -plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='4 hybridstrobemers') -plt.plot(pos_order3, 
[x[0] for x in hybridstrobemers3], color = colors[1], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='4 randstrobemers') -plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[3], label='8 minstrobemers') -plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='8 hybridstrobemers') -plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='4 randstrobemers3',linestyle="dashed") -plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='8 randstrobemers') -plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='8 randstrobemers3',linestyle="dashed") - -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[0], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[3], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in 
minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[3], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[4], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = colors_error[4], alpha=0.7) -plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) -plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) - -plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") -plt.savefig("../results/Speed_strobemers.png", bbox_inches='tight') - # Plot comparison between strobemers all gaps original_minstrobemers2 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) From 5488d6238515591e25fd0e8a522614ccdb8bed38 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 6 Feb 2023 15:39:38 +0100 Subject: [PATCH 16/34] Count for strobemers with representative methods. --- src/compare.cpp | 98 +++++++++++++++++++------------ test/api/minstrobe_hash_test.cpp | 9 ++- test/api/randstrobe_hash_test.cpp | 4 ++ test/cli/minions_counts_test.cpp | 8 +++ 4 files changed, 81 insertions(+), 38 deletions(-) diff --git a/src/compare.cpp b/src/compare.cpp index 9d11352..9bd884e 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -191,8 +191,8 @@ void accuracy(urng_t input_view, * \param method_name Name of the tested method. * \param args The arguments about the view to be used, needed for strobemers. 
*/ -template -void counts(std::vector sequence_files, urng_t input_view, std::string method_name, range_arguments & args) +template +void counts(std::vector & sequence_files, urng_t input_view, std::string method_name, range_arguments & args) { std::vector counts_results{}; std::ofstream outfile; @@ -200,25 +200,50 @@ void counts(std::vector sequence_files, urng_t input_view { robin_hood::unordered_node_map hash_table{}; seqan3::sequence_file_input> fin{sequence_files[i]}; - if constexpr (underlying_strobemer) // TODO: Fix this, so that syncmers for strobemers do not need extra code. + for (auto & [seq] : fin) { - for (auto & [seq] : fin) - { - for (auto && hash : seq | input_view) - { - if (syncmer_filter(hash,args.w_size.get(), (args.k_size *args.order), args.positions, args.seed_se.get())) - hash_table[hash] = std::min(65534u, hash_table[hash] + 1); - } - } + for (auto && hash : seq | input_view) + hash_table[hash] = std::min(65534u, hash_table[hash] + 1); } - else + counts_results.push_back(hash_table.size()); + + // Store representative k-mers + outfile.open(std::string{args.path_out} + method_name + "_"+ std::string{sequence_files[i].stem()} + "_counts.out", std::ios::binary); + for (auto && hash : hash_table) + { + outfile.write(reinterpret_cast(&hash.first), sizeof(hash.first)); + outfile.write(reinterpret_cast(&hash.second), sizeof(hash.second)); + } + outfile.close(); + } + + double mean_counts, stdev_counts; + get_mean_and_var(counts_results, mean_counts, stdev_counts); + + // Store speed and counts + outfile.open(std::string{args.path_out} + method_name + "_counts.out"); + outfile << method_name << "\t" << *std::min_element(counts_results.begin(), counts_results.end()) << "\t" << mean_counts << "\t" << stdev_counts << "\t" << *std::max_element(counts_results.begin(), counts_results.end()) << "\n"; + outfile.close(); +} + +template +void counts_strobemer(std::vector & sequence_files, urng_t input_view, urng_t2 input_view2, std::string method_name, 
range_arguments & args) +{ + std::vector counts_results{}; + std::ofstream outfile; + for (int i = 0; i < sequence_files.size(); ++i) + { + robin_hood::unordered_node_map hash_table{}; + seqan3::sequence_file_input> fin{sequence_files[i]}; + for (auto & [seq] : fin) { - for (auto & [seq] : fin) + std::vector res = seq | input_view; + for (auto && hash : res | input_view2) { - for (auto && hash : seq | input_view) - hash_table[hash] = std::min(65534u, hash_table[hash] + 1); + hash_table[hash] = std::min(65534u, hash_table[hash] + 1); } } + counts_results.push_back(hash_table.size()); // Store representative k-mers @@ -631,56 +656,55 @@ void do_counts(std::vector sequence_files, range_argument { if(underlying_strobemer) { - seqan3::debug_stream << args.shape.size(); switch(args.name) { case minimiser: { if (args.hybrid & (args.order == 2)) - counts(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.hybrid & (args.order == 3)) - counts(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); if (args.minstrobers & (args.order == 2)) - counts(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + 
counts_strobemer(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.minstrobers & (args.order == 3)) - counts(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + counts_strobemer(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); if (args.rand & (args.order == 2)) - counts(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + counts_strobemer(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.rand & (args.order == 3)) - counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); - + counts_strobemer(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); } break; case modmers: { if (args.hybrid & (args.order == 2)) - counts(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); if (args.hybrid & 
(args.order == 3)) - counts(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); if (args.minstrobers & (args.order == 2)) - counts(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + counts_strobemer(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); if (args.minstrobers & (args.order == 3)) - counts(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + counts_strobemer(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); if (args.rand & (args.order == 2)) - counts(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); + counts_strobemer(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); if (args.rand & (args.order == 3)) - counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to | modmer(args.w_size.get()), create_name(args, true), args); - + counts_strobemer(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), modmer(args.w_size.get()), create_name(args, true), args); } break; case syncmer: { + auto syncmer = std::views::filter([args] (uint64_t i) + {return syncmer_filter(i,args.w_size.get(), (args.k_size 
*args.order), args.positions, args.seed_se.get());}); if (args.hybrid & (args.order == 2)) - counts),true>(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), syncmer, create_name(args, true), args); if (args.hybrid & (args.order == 3)) - counts),true>(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), syncmer, create_name(args, true), args); if (args.minstrobers & (args.order == 2)) - counts),true>(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), syncmer, create_name(args, true), args); if (args.minstrobers & (args.order == 3)) - counts),true>(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), syncmer, create_name(args, true), args); if (args.rand & (args.order == 2)) - counts),true>(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), syncmer, create_name(args, true), args); if (args.rand & (args.order == 3)) - counts),true>(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se) | seqan3::views::to, create_name(args,true), args); + counts_strobemer(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, 
args.seed_se), syncmer, create_name(args, true), args); } - break; + break; } } else diff --git a/test/api/minstrobe_hash_test.cpp b/test/api/minstrobe_hash_test.cpp index 25a168b..19ce9f2 100644 --- a/test/api/minstrobe_hash_test.cpp +++ b/test/api/minstrobe_hash_test.cpp @@ -8,12 +8,14 @@ #include #include #include +#include #include #include #include "minstrobe_hash.hpp" +#include "modmer.hpp" using seqan3::operator""_dna4; using seqan3::operator""_shape; @@ -58,6 +60,8 @@ class minstrobe_hash_test : public ::testing::Test result_t result3_gapped{35, 37, 21, 135, 67}; result_t result3_ungapped_start{39023, 25023}; result_t result3_gapped_start{135, 67}; + result_t result3_minimiser{1637,357,25023}; + result_t result3_modmer{357,25023}; std::vector text1_3{"AAAAAAAAAAAAAAAA"_dna4}; result_t result3_1{0}; // Same result for ungapped and gapped @@ -105,7 +109,7 @@ TEST_F(minstrobe_hash_test, ungapped) EXPECT_RANGE_EQ(result3_1, text1_3 | ungapped3_view); EXPECT_RANGE_EQ(result3_3_ungapped, text3_3 | ungapped3_view); - //EXPECT_NO_THROW((text1 | minstrobe3_hash(ungapped_shape,3,6))); // Todo: Fix, Do I want to throw when sequence not long enough? + //EXPECT_NO_THROW((text1 | minstrobe3_hash(ungapped_shape,3,6))); // Todo: Fix, Do I want to throw when sequence not long enough? 
EXPECT_THROW((text3 | minstrobe3_hash(ungapped_shape,3,2)), std::invalid_argument); } @@ -127,4 +131,7 @@ TEST_F(minstrobe_hash_test, combinability) auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | ungapped_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_view); + + EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | seqan3::views::minimiser(2)); + EXPECT_RANGE_EQ(result3_modmer, text3 | ungapped_view | modmer(3)); } diff --git a/test/api/randstrobe_hash_test.cpp b/test/api/randstrobe_hash_test.cpp index ef7d69b..325689d 100644 --- a/test/api/randstrobe_hash_test.cpp +++ b/test/api/randstrobe_hash_test.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -58,6 +59,7 @@ class randstrobe_hash_test : public ::testing::Test result_t result3_gapped{35, 37, 21, 139, 67}; result_t result3_ungapped_start{39103, 25074}; result_t result3_gapped_start{139, 67}; + result_t result3_minimiser{1637,357,25074}; std::vector text1_3{"AAAAAAAAAAAAAAAA"_dna4}; result_t result3_1{0}; // Same result for ungapped and gapped @@ -127,4 +129,6 @@ TEST_F(randstrobe_hash_test, combinability) auto start_at_a = std::views::drop(3); EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | ungapped_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_view); + + EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | seqan3::views::minimiser(2)); } diff --git a/test/cli/minions_counts_test.cpp b/test/cli/minions_counts_test.cpp index 7af25d4..40418f6 100644 --- a/test/cli/minions_counts_test.cpp +++ b/test/cli/minions_counts_test.cpp @@ -62,6 +62,14 @@ TEST_F(cli_test, minstrobemer) EXPECT_EQ(result.err, std::string{}); } +TEST_F(cli_test, minstrobemer_minimiser) +{ + cli_test_result result = execute_app("minions counts --method minimiser -w 3 --strobemer -k 19 --w-min 16 --w-max 30 --order 2 --min", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + 
EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + TEST_F(cli_test,closedsyncmer) { cli_test_result result = execute_app("minions counts --method syncmer --pos 0 --pos 16 -k 19 -w 3", data("example1.fasta")); From 17027339a617a297b8c50beb756f7acfcd95d59e Mon Sep 17 00:00:00 2001 From: mitradarja Date: Wed, 8 Feb 2023 16:46:55 +0100 Subject: [PATCH 17/34] Match for strobmers and representative methods. --- .gitignore | 2 + include/compare.h | 3 +- src/compare.cpp | 258 ++++++++++++++++++++++++-------- src/main.cpp | 8 +- test/cli/minions_match_test.cpp | 10 +- 5 files changed, 216 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 641d323..975a331 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,8 @@ src/snakemake/ !src/snakemake/genmap/genmap_uniqueness.py !src/snakemake/accuracy/add_errors.py +!src/snakemake/accuracy/README +!src/snakemake/accuracy/Snakefile !src/snakemake/speed/README !src/snakemake/speed/Snakefile diff --git a/include/compare.h b/include/compare.h index dbb1ccc..e11c051 100644 --- a/include/compare.h +++ b/include/compare.h @@ -134,8 +134,9 @@ void do_distance(std::filesystem::path sequence_file, range_arguments & args); * \param sequence_file1 The first sequence file. * \param sequence_file2 The second sequence file. * \param args The arguments about the view to be used. + * \param underlying_strobemer True, if strobemers should be used with a representative method like minimizer. */ -void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, range_arguments & args); +void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, comparing the speed. * \param sequence_files A vector of sequence files. 
diff --git a/src/compare.cpp b/src/compare.cpp index 9bd884e..1df095c 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -293,14 +293,40 @@ void compare_cov2(std::filesystem::path sequence_file, urng_t distance_view, std outfile.close(); } -template -std::vector read_seq_file(std::filesystem::path sequence_file, urng_t input_view) +template +std::vector read_seq_file(std::filesystem::path sequence_file, urng_t input_view, range_arguments & args) +{ + std::vector vector{}; + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + if constexpr (syncmer) + { + for (auto && hash : seq | input_view) + { + if (syncmer_filter(hash, args.w_size.get(), (args.k_size *args.order), args.positions, args.seed_se.get())) + vector.push_back(hash); + } + } + else + { + for (auto && hash : seq | input_view) + vector.push_back(hash); + } + } + + return vector; +} + +template +std::vector read_seq_file(std::filesystem::path sequence_file, urng_t input_view, urng_t2 input_view2) { std::vector vector{}; seqan3::sequence_file_input> fin{sequence_file}; for (auto & [seq] : fin) { - for (auto && hash : seq | input_view) + auto v = seq | input_view; + for (auto && hash : v | input_view2) vector.push_back(hash); } @@ -350,8 +376,8 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ } else { - seq1_vector = read_seq_file(sequence_file1, input_view); - seq2_vector = read_seq_file(sequence_file2, input_view); + seq1_vector = read_seq_file(sequence_file1, input_view, args); + seq2_vector = read_seq_file(sequence_file2, input_view, args); } std::size_t length{0}; @@ -408,24 +434,23 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ std::cout << "Expected Island Size: " << get_expected(islands) << "\n"; } -/*! \brief Count the number of matches and match coverage found in two sequence files. - * \param sequence_file1 The first sequence file. - * \param sequence_file2 The second sequence file. 
- * \param input_view View that should be tested. - * \param compare_view View for comparison, should be kmer_hash view. +/*! \brief Do the actual matching for repesentative methods. + * \param seq1_vector The first vector based on the first file. + * \param seq2_vector The second vector based on the second file. + * \param all1_vector The first vector containig all submers (kmers or strobmers) of the first file. + * \param all2_vector The second vector containig all submers (kmers or strobmers) of the second file. * \param method_name Name of the tested method. * \param args The arguments about the view to be used, needed for strobemers. */ -template -void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, urng_t input_view, urng2_t compare_view, std::string method_name, range_arguments & args) +void match_vectors(std::vector & seq1_vector, + std::vector & seq2_vector, + std::vector & all1_vector, + std::vector & all2_vector, + std::string method_name, + range_arguments & args) { uint64_t matches{0}; uint64_t missed{0}; - std::vector seq1_vector = read_seq_file(sequence_file1, input_view); - std::vector seq2_vector = read_seq_file(sequence_file2, input_view); - std::vector all1_vector = read_seq_file(sequence_file1, compare_view); - std::vector all2_vector = read_seq_file(sequence_file2, compare_view); - int it_1{0}; int it_2{0}; bool changed{true}; @@ -447,12 +472,7 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ if (current_island > 0) new_island = true; - switch(args.name) - { - case minimiser: fill_positions(positions, i, args.w_size.get()); - break; - case modmers: fill_positions(positions, i, args.k_size); - } + fill_positions(positions, i, args.k_size); } else if ((seq1_vector[it_1] == all1_vector[i]) & (seq2_vector[it_2] == all2_vector[i]) & changed) { @@ -492,6 +512,63 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ std::cout << "Expected Island Size: " 
<< get_expected(islands) << "\n"; } +/*! \brief Count the number of matches and match coverage found in two sequence files. + * \param sequence_file1 The first sequence file. + * \param sequence_file2 The second sequence file. + * \param input_view View that should be tested. + * \param compare_view View for comparison, should be kmer_hash view. + * \param method_name Name of the tested method. + * \param args The arguments about the view to be used, needed for strobemers. + */ +template +void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, urng_t input_view, urng2_t compare_view, std::string method_name, range_arguments & args) +{ + std::vector seq1_vector = read_seq_file(sequence_file1, input_view, args); + std::vector seq2_vector = read_seq_file(sequence_file2, input_view, args); + std::vector all1_vector = read_seq_file(sequence_file1, compare_view, args); + std::vector all2_vector = read_seq_file(sequence_file2, compare_view, args); + + match_vectors(seq1_vector, seq2_vector, all1_vector, all2_vector, method_name, args); +} + +/*! \brief Count the number of matches and match coverage found in two sequence files. + * \param sequence_file1 The first sequence file. + * \param sequence_file2 The second sequence file. + * \param input_view View that should be tested. + * \param compare_view View for comparison, should be kmer_hash view. + * \param method_name Name of the tested method. + * \param args The arguments about the view to be used, needed for strobemers. 
+ */ +template +void match_strobemer(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, urng_t input_view, urng2_t compare_view, std::string method_name, range_arguments & args) +{ + std::vector seq1_vector = read_seq_file(sequence_file1, compare_view, input_view); + std::vector seq2_vector = read_seq_file(sequence_file2, compare_view, input_view); + std::vector all1_vector = read_seq_file(sequence_file1, compare_view, args); + std::vector all2_vector = read_seq_file(sequence_file2, compare_view, args); + + match_vectors(seq1_vector, seq2_vector, all1_vector, all2_vector, method_name, args); +} + +/*! \brief Count the number of matches and match coverage found in two sequence files for syncmers bases on strobemers. + * \param sequence_file1 The first sequence file. + * \param sequence_file2 The second sequence file. + * \param input_view View that should be tested. + * \param compare_view View for comparison, should be kmer_hash view. + * \param method_name Name of the tested method. + * \param args The arguments about the view to be used, needed for strobemers. + */ +template +void match_syncmer(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, urng_t input_view, std::string method_name, range_arguments & args) +{ + std::vector seq1_vector = read_seq_file(sequence_file1, input_view, args); + std::vector seq2_vector = read_seq_file(sequence_file2, input_view, args); + std::vector all1_vector = read_seq_file(sequence_file1, input_view, args); + std::vector all2_vector = read_seq_file(sequence_file2, input_view, args); + + match_vectors(seq1_vector, seq2_vector, all1_vector, all2_vector, method_name, args); +} + /*! \brief Function, that measures the speed of a method. * \param sequence_files A vector of sequence files. * \param input_view View that should be tested. 
@@ -753,48 +830,107 @@ void do_distance(std::filesystem::path sequence_file, range_arguments & args) } } -void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, range_arguments & args) +void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequence_file2, range_arguments & args, bool underlying_strobemer) { seqan3::seed seed{args.seed_se}; - switch(args.name) + + if(underlying_strobemer) { - case kmer: match(sequence_file1, sequence_file2, seqan3::views::kmer_hash(args.shape), create_name(args), args); - break; - case minimiser: match(sequence_file1, sequence_file2, seqan3::views::minimiser_hash(args.shape, - args.w_size, args.seed_se),seqan3::views::minimiser_hash(args.shape, - seqan3::window_size{args.shape.size()}, args.seed_se), create_name(args), args); - break; - case modmers: match(sequence_file1, sequence_file2, modmer_hash(args.shape, - args.w_size.get(), args.seed_se), modmer_hash(args.shape, - 1, args.seed_se), create_name(args), args); - break; - case strobemer: std::ranges::empty_view empty{}; - if (args.lib_implementation) - { - if (args.rand & (args.order == 2)) - match, 1>(sequence_file1, sequence_file2, empty, create_name(args), args); - else if (args.rand & (args.order == 3)) - match, 2>(sequence_file1, sequence_file2, empty, create_name(args), args); - else if (args.hybrid) - match, 3>(sequence_file1, sequence_file2, empty, create_name(args), args); - else if (args.minstrobers & (args.order == 2)) - match, 4>(sequence_file1, sequence_file2, empty, create_name(args), args); + switch(args.name) + { + case minimiser: { + if (args.hybrid & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.hybrid & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, 
seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + } + break; + case modmers: { + if (args.hybrid & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, modmer(args.w_size.get()), hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.hybrid & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, modmer(args.w_size.get()), hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, modmer(args.w_size.get()), minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, 
modmer(args.w_size.get()), minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 2)) + match_strobemer(sequence_file1, sequence_file2, modmer(args.w_size.get()),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 3)) + match_strobemer(sequence_file1, sequence_file2, modmer(args.w_size.get()), randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + } + break; + case syncmer: { + if (args.hybrid & (args.order == 2)) + match_syncmer(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.hybrid & (args.order == 3)) + match_syncmer(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 2)) + match_syncmer(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.minstrobers & (args.order == 3)) + match_syncmer(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 2)) + match_syncmer(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + if (args.rand & (args.order == 3)) + match_syncmer(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + } + break; + } + } + else + { + switch(args.name) + { + case kmer: match(sequence_file1, sequence_file2, seqan3::views::kmer_hash(args.shape), create_name(args), args); + break; + case minimiser: match(sequence_file1, sequence_file2, seqan3::views::minimiser_hash(args.shape, + 
args.w_size, args.seed_se),seqan3::views::minimiser_hash(args.shape, + seqan3::window_size{args.shape.size()}, args.seed_se), create_name(args), args); + break; + case modmers: match(sequence_file1, sequence_file2, modmer_hash(args.shape, + args.w_size.get(), args.seed_se), modmer_hash(args.shape, + 1, args.seed_se), create_name(args), args); + break; + case syncmer: match(sequence_file1, sequence_file2, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), seqan3::views::minimiser_hash(args.shape, + seqan3::window_size{args.shape.size()}, args.seed_se), create_name(args), args); + break; + case strobemer: std::ranges::empty_view empty{}; + if (args.lib_implementation) + { + if (args.rand & (args.order == 2)) + match, 1>(sequence_file1, sequence_file2, empty, create_name(args), args); + else if (args.rand & (args.order == 3)) + match, 2>(sequence_file1, sequence_file2, empty, create_name(args), args); + else if (args.hybrid) + match, 3>(sequence_file1, sequence_file2, empty, create_name(args), args); + else if (args.minstrobers & (args.order == 2)) + match, 4>(sequence_file1, sequence_file2, empty, create_name(args), args); + } + else + { + if (args.hybrid & (args.order == 2)) + match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.hybrid & (args.order == 3)) + match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se),create_name(args), args); + else if (args.minstrobers & (args.order == 2)) + match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.minstrobers & (args.order == 3)) + match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 2)) + match(sequence_file1, sequence_file2, 
randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 3)) + match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } - else - { - if (args.hybrid & (args.order == 2)) - match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); - else if (args.hybrid & (args.order == 3)) - match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se),create_name(args), args); - else if (args.minstrobers & (args.order == 2)) - match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); - else if (args.minstrobers & (args.order == 3)) - match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); - else if (args.rand & (args.order == 2)) - match(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); - else if (args.rand & (args.order == 3)) - match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); - } + break; + } } } diff --git a/src/main.cpp b/src/main.cpp index 1f15cf3..e6c244c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -186,17 +186,21 @@ int match(seqan3::argument_parser & parser) range_arguments args{}; std::filesystem::path sequence_file1; std::filesystem::path sequence_file2; + bool underlying_strobemer = false; parser.info.short_description = "Counts the number of matches for a given method between the two given files."; parser.add_positional_option(sequence_file1, "Please provide the first sequence file."); parser.add_positional_option(sequence_file2, "Please provide the second sequence file."); all_arguments(parser, 
args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", - seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); + seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer", "syncmer"}); parser.add_flag(args.lib_implementation, '\0', "original", "Set, if you want to use the strobemer implementation from Sahlin."); + parser.add_flag(underlying_strobemer,'\0', "strobemer", "If strobemers should be used as base for representative " + "methods like minimizers. Default: False."); read_range_arguments_minimiser(parser, args); read_range_arguments_strobemers(parser, args); + read_range_arguments_syncmers(parser, args); try { @@ -210,7 +214,7 @@ int match(seqan3::argument_parser & parser) } string_to_methods(method, args.name); - do_match(sequence_file1, sequence_file2, args); + do_match(sequence_file1, sequence_file2, args, underlying_strobemer); return 0; } diff --git a/test/cli/minions_match_test.cpp b/test/cli/minions_match_test.cpp index bd9a140..7db7e7c 100644 --- a/test/cli/minions_match_test.cpp +++ b/test/cli/minions_match_test.cpp @@ -38,6 +38,14 @@ TEST_F(cli_test, modmer) EXPECT_EQ(result.err, std::string{}); } +TEST_F(cli_test, syncmer) +{ + cli_test_result result = execute_app("minions match --method syncmer -k 19 -w 2 -p 0", data("example1.fasta"), data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{"Matches: 1305894\tMissed: 0\nMatch Coverage: 97.9846\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.err, std::string{}); +} + TEST_F(cli_test, strobemer) { cli_test_result result = execute_app("minions match --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --rand", data("example1.fasta"), data("example1.fasta")); @@ -68,7 +76,7 @@ TEST_F(cli_test, wrong_method) std::string expected { "Error. Incorrect command line input for match. 
Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer].\n" + "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer,syncmer].\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); From 2836ed1d57db925626bbc17d4179f0fe625f27b5 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Thu, 9 Feb 2023 14:48:08 +0100 Subject: [PATCH 18/34] Correct distance and distance with strobemers as bases. --- .gitignore | 2 + include/compare.h | 3 +- include/minimiser_hash_distance.hpp | 114 ------------ include/modmer_hash_distance.hpp | 142 -------------- src/compare.cpp | 231 +++++++++++++++++++---- src/main.cpp | 9 +- src/snakemake/distance/Snakefile | 69 +++++++ test/api/CMakeLists.txt | 3 - test/api/minimiser_distance_test.cpp | 245 ------------------------- test/api/modmer_hash_distance_test.cpp | 130 ------------- test/cli/minions_distance_test.cpp | 18 +- 11 files changed, 284 insertions(+), 682 deletions(-) delete mode 100644 include/minimiser_hash_distance.hpp delete mode 100644 include/modmer_hash_distance.hpp create mode 100644 src/snakemake/distance/Snakefile delete mode 100644 test/api/minimiser_distance_test.cpp delete mode 100644 test/api/modmer_hash_distance_test.cpp diff --git a/.gitignore b/.gitignore index 975a331..4b24751 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,8 @@ src/snakemake/ !src/snakemake/accuracy/README !src/snakemake/accuracy/Snakefile +!src/snakemake/distance/Snakefile + !src/snakemake/speed/README !src/snakemake/speed/Snakefile !src/snakemake/speed/plot_speed.py diff --git a/include/compare.h b/include/compare.h index e11c051..b33b9c6 100644 --- a/include/compare.h +++ b/include/compare.h @@ -127,8 +127,9 @@ void do_counts(std::vector sequence_files, range_argument /*! \brief Function, comparing the methods in regard of their distance. * \param sequence_file A sequence file. * \param args The arguments about the view to be used. 
+ * \param underlying_strobemer True, if strobemers should be used with a representative method like minimizer. */ -void do_distance(std::filesystem::path sequence_file, range_arguments & args); +void do_distance(std::filesystem::path sequence_file, range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, counting number of matches between two sequences. * \param sequence_file1 The first sequence file. diff --git a/include/minimiser_hash_distance.hpp b/include/minimiser_hash_distance.hpp deleted file mode 100644 index 3c8ebb5..0000000 --- a/include/minimiser_hash_distance.hpp +++ /dev/null @@ -1,114 +0,0 @@ -// ----------------------------------------------------------------------------------------------------- -// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin -// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik -// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License -// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md -// ----------------------------------------------------------------------------------------------------- - -/*!\file - * \author Mitra Darvish - * \brief Provides seqan3::views::minimiser_distance_hash. - */ - -#pragma once - -#include -#include -#include -#include - -#include "minimiser_distance.hpp" - - -namespace seqan3::detail -{ -//!\brief seqan3::views::minimiser_distance_hash's range adaptor object type (non-closure). -//!\ingroup search_views -struct minimiser_distance_hash_fn -{ - /*!\brief Store the shape and the window size and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The windows size to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. 
- */ - constexpr auto operator()(shape const & shape, window_size const window_size) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, window_size}; - } - - /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The size of the window. - * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. - */ - constexpr auto operator()(shape const & shape, window_size const window_size, seed const seed) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, window_size, seed}; - } - - /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type - * of the range must model seqan3::semialphabet. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] window_size The size of the window. - * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. - * \returns A range of converted elements. 
- */ - template - constexpr auto operator()(urng_t && urange, - shape const & shape, - window_size const window_size, - seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::minimiser_distance_hash cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::minimiser_distance_hash must model std::ranges::forward_range."); - static_assert(semialphabet>, - "The range parameter to views::minimiser_distance_hash must be over elements of seqan3::semialphabet."); - - if (shape.size() > window_size.get()) - throw std::invalid_argument{"The size of the shape cannot be greater than the window size."}; - - auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}); - - auto reverse_strand = std::forward(urange) | seqan3::views::complement - | std::views::reverse - | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}) - | std::views::reverse; - - return minimiser_distance_view(forward_strand, reverse_strand, window_size.get() - shape.size() + 1); - } -}; - -} // namespace seqan3::detail - - -/*!\name Alphabet related views - * \{ - */ - -/*!\brief Computes the distance of minimisers for a range with a given shape, window size and seed. - * \tparam urng_t The type of the range being processed. - * \param[in] urange The range being processed. [parameter is omitted in pipe notation] - * \param[in] shape The seqan3::shape that determines how to compute the hash value. - * \param[in] window_size The window size to use. - * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. - * \returns A range of `size_t` where each value is the minimiser of the resp. window. - * See below for the properties of the returned range. 
- * \ingroup utility_views - * - * \details - * For more information look into seqan3::views::minimiser_hash - */ -inline constexpr auto minimiser_hash_distance = seqan3::detail::minimiser_distance_hash_fn{}; - -//!\} diff --git a/include/modmer_hash_distance.hpp b/include/modmer_hash_distance.hpp deleted file mode 100644 index c448ba3..0000000 --- a/include/modmer_hash_distance.hpp +++ /dev/null @@ -1,142 +0,0 @@ -// ----------------------------------------------------------------------------------------------------- -// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin -// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik -// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License -// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md -// ----------------------------------------------------------------------------------------------------- - -/*!\file - * \author Mitra Darvish - * \brief Provides modmer_hash. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "modmer.hpp" -#include "shared.hpp" - -namespace seqan3::detail -{ -//!\brief seqan3::views::modmer_hash's range adaptor object type (non-closure). -//!\ingroup search_views -struct modmer_hash_distance_fn -{ - /*!\brief Store the shape and the window size and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] mod_used The mod value to use. - * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. - * \returns A range of converted elements. - */ - constexpr auto operator()(shape const & shape, uint32_t const mod_used) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, mod_used}; - } - - /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. - * \param[in] shape The seqan3::shape to use for hashing. 
- * \param[in] mod_used The mod value to use. - * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. - * \returns A range of converted elements. - */ - constexpr auto operator()(shape const & shape, uint32_t const mod_used, seed const seed) const - { - return seqan3::detail::adaptor_from_functor{*this, shape, mod_used, seed}; - } - - /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. - * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type - * of the range must model seqan3::semialphabet. - * \param[in] shape The seqan3::shape to use for hashing. - * \param[in] mod_used The mod value to use. - * \param[in] seed The seed to use. - * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. - * \returns A range of converted elements. - */ - template - constexpr auto operator()(urng_t && urange, - shape const & shape, - uint32_t const mod_used, - seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const - { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); - static_assert(std::ranges::forward_range, - "The range parameter to views::modmer_hash must model std::ranges::forward_range."); - static_assert(semialphabet>, - "The range parameter to views::modmer_hash must be over elements of seqan3::semialphabet."); - - if (mod_used == 1) // Would just return urange1 without any changes - throw std::invalid_argument{"The chosen mod_used is not valid. 
" - "Please choose a value greater than 1."}; - - auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}); - - auto reverse_strand = std::forward(urange) | seqan3::views::complement - | std::views::reverse - | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}) - | std::views::reverse; - - auto combined_strand = seqan3::views::zip(forward_strand, reverse_strand) | std::views::transform([seed](std::tuple i){return fnv_hash(std::get<0>(i) + std::get<1>(i), seed.get());}); - return seqan3::detail::modmer_view(combined_strand, mod_used); - } -}; - -} // namespace seqan3::detail - -/*!\name Alphabet related views - * \{ - */ - -/*!\brief Computes the distance of modmers for a range with a given shape, mod_used and seed. - * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is - * omitted in pipe notation] - * \param[in] urange The range being processed. [parameter is omitted in pipe notation] - * \param[in] shape The seqan3::shape that determines how to compute the hash value. - * \param[in] mod_used The mod value to use. - * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. - * \returns A range of `size_t` where each value is the modmer of the resp. window. - * See below for the properties of the returned range. - * \ingroup search_views - * - * \attention - * Be aware of the requirements of the seqan3::views::kmer_hash view. 
- * - * - * ### View properties - * - * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | - * |----------------------------------|:----------------------------------:|:--------------------------------:| - * | std::ranges::input_range | *required* | *preserved* | - * | std::ranges::forward_range | *required* | *preserved* | - * | std::ranges::bidirectional_range | | *lost* | - * | std::ranges::random_access_range | | *lost* | - * | std::ranges::contiguous_range | | *lost* | - * | | | | - * | std::ranges::viewable_range | *required* | *guaranteed* | - * | std::ranges::view | | *guaranteed* | - * | std::ranges::sized_range | | *lost* | - * | std::ranges::common_range | | *lost* | - * | std::ranges::output_range | | *lost* | - * | seqan3::const_iterable_range | | *preserved* | - * | | | | - * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | - * - * See the views views submodule documentation for detailed descriptions of the view properties. - * - * \hideinitializer - * - */ -inline constexpr auto modmer_hash_distance = seqan3::detail::modmer_hash_distance_fn{}; - -//!\} diff --git a/src/compare.cpp b/src/compare.cpp index 1df095c..e992389 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -9,10 +9,8 @@ #include "compare.h" #include "hybridstrobe_hash.hpp" -#include "minimiser_hash_distance.hpp" #include "minstrobe_hash.hpp" #include "modmer_hash.hpp" -#include "modmer_hash_distance.hpp" #include "randstrobe_hash.hpp" #include "syncmer_hash.hpp" @@ -265,34 +263,6 @@ void counts_strobemer(std::vector & sequence_files, urng_ outfile.close(); } -/*! \brief Function, get the coverage of one sequence file for a representative method. - * \param sequence_file A sequence file. - * \param distance_view View that returns distances. - * \param method_name Name of the tested method. - * \param args The arguments about the view to be used, needed for strobemers. 
- */ -template -void compare_cov2(std::filesystem::path sequence_file, urng_t distance_view, std::string method_name, range_arguments & args) -{ - std::vector coverage{}; - std::vector stdev{}; - std::ofstream outfile; - - seqan3::sequence_file_input> fin{sequence_file}; - for (auto & [seq] : fin) - { - for (auto && hash : seq | distance_view) - coverage.push_back(hash); - } - double mean_coverage, stdev_coverage; - get_mean_and_var(coverage, mean_coverage, stdev_coverage); - - // Store coverage - outfile.open(std::string{args.path_out} + method_name + "_coverage.out"); - outfile << method_name << "\t" << *std::min_element(coverage.begin(), coverage.end()) << "\t" << mean_coverage << "\t" << stdev_coverage << "\t" << *std::max_element(coverage.begin(), coverage.end()) << "\n"; - outfile.close(); -} - template std::vector read_seq_file(std::filesystem::path sequence_file, urng_t input_view, range_arguments & args) { @@ -349,6 +319,131 @@ std::vector read_seq_file(std::filesystem::path sequence_file, range_a return vector; } +template +void distance(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view) +{ + std::vector distances{}; + int distance = 0; + int it_1 = 0; + int it_2 = 0; + + std::vector vector{}; + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + int distance = 0; + + auto representative = seq | input_view; + auto rep_it = representative.begin(); + auto compare = seq | compare_view; + auto comp_it = compare.begin(); + while((rep_it != representative.end()) & (comp_it != compare.end())) + { + if (*rep_it == *comp_it) + { + if (comp_it != compare.begin()) + { + distances.push_back(distance); + distance = 0; + } + rep_it++; + } + else + { + distance++; + } + comp_it++; + } + } + + double mean_distance, stdev_distance; + get_mean_and_var(distances, mean_distance, stdev_distance); + std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << 
stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; +} + +template +void distance_strobemer(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view) +{ + std::vector distances{}; + int distance = 0; + int it_1 = 0; + int it_2 = 0; + + std::vector vector{}; + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + int distance = 0; + + auto rep1 = seq | compare_view; + auto representative = rep1 | input_view; + auto rep_it = representative.begin(); + auto compare = seq | compare_view; + auto comp_it = compare.begin(); + do + { + if (*rep_it == *comp_it) + { + if (comp_it != compare.begin()) + { + distances.push_back(distance); + distance = 0; + } + rep_it++; + } + else + { + distance++; + } + comp_it++; + } + while((rep_it != representative.end()) & (comp_it != compare.end())); + } + + double mean_distance, stdev_distance; + get_mean_and_var(distances, mean_distance, stdev_distance); + std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; +} + +template +void distance_syncmer(std::filesystem::path sequence_file, urng_t input_view, range_arguments & args) +{ + std::vector distances{}; + int distance = 0; + int it_1 = 0; + int it_2 = 0; + + std::vector vector{}; + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + int distance = 0; + auto representative = seq | input_view; + auto rep_it = representative.begin(); + do + { + if (syncmer_filter(*rep_it, args.w_size.get(), (args.k_size *args.order), args.positions, args.seed_se.get())) + { + if (rep_it != representative.begin()) + { + distances.push_back(distance); + distance = 0; + } + } + else + { + distance++; + } + rep_it++; + } + while(rep_it != representative.end()); + } + + double mean_distance, stdev_distance; + get_mean_and_var(distances, 
mean_distance, stdev_distance); + std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; +} + void fill_positions(std::vector & positions, int pos, int match_length) { for(int j = pos; j < pos+match_length; j++) @@ -817,16 +912,73 @@ void do_counts(std::vector sequence_files, range_argument } } -void do_distance(std::filesystem::path sequence_file, range_arguments & args) +void do_distance(std::filesystem::path sequence_file, range_arguments & args, bool underlying_strobemer) { - switch(args.name) + if (underlying_strobemer) { - case minimiser: compare_cov2(sequence_file, minimiser_hash_distance(args.shape, - args.w_size, args.seed_se), create_name(args), args); - break; - case modmers: compare_cov2(sequence_file, modmer_hash_distance(args.shape, - args.w_size.get(), args.seed_se), create_name(args), args); - break; + switch(args.name) + { + case minimiser: { + if (args.hybrid & (args.order == 2)) + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.hybrid & (args.order == 3)) + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.minstrobers & (args.order == 2)) + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.minstrobers & (args.order == 3)) + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.rand & (args.order == 2)) + distance_strobemer(sequence_file, 
seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.rand & (args.order == 3)) + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + } + break; + case modmers: { + if (args.hybrid & (args.order == 2)) + distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.hybrid & (args.order == 3)) + distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.minstrobers & (args.order == 2)) + distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.minstrobers & (args.order == 3)) + distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.rand & (args.order == 2)) + distance_strobemer(sequence_file, modmer(args.w_size.get()),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + if (args.rand & (args.order == 3)) + distance_strobemer(sequence_file, modmer(args.w_size.get()), randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + } + break; + case syncmer: { + if (args.hybrid & (args.order == 2)) + distance_syncmer(sequence_file, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + if (args.hybrid & (args.order == 3)) + distance_syncmer(sequence_file, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + if (args.minstrobers & (args.order == 2)) + distance_syncmer(sequence_file, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + if (args.minstrobers & (args.order == 3)) + distance_syncmer(sequence_file, minstrobe3_hash(args.shape, 
args.w_min, args.w_max, args.seed_se), args); + if (args.rand & (args.order == 2)) + distance_syncmer(sequence_file, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + if (args.rand & (args.order == 3)) + distance_syncmer(sequence_file, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + } + break; + } + } + else + { + switch(args.name) + { + case minimiser: distance(sequence_file, seqan3::views::minimiser_hash(args.shape, args.w_size, args.seed_se), + seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se)); + break; + case modmers: distance(sequence_file, modmer_hash(args.shape, args.w_size.get(), args.seed_se), + modmer_hash(args.shape, 1, args.seed_se)); + break; + case syncmer: distance(sequence_file, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), + seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se)); + break; + } } } @@ -896,8 +1048,7 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen seqan3::window_size{args.shape.size()}, args.seed_se), create_name(args), args); break; case modmers: match(sequence_file1, sequence_file2, modmer_hash(args.shape, - args.w_size.get(), args.seed_se), modmer_hash(args.shape, - 1, args.seed_se), create_name(args), args); + args.w_size.get(), args.seed_se), modmer_hash(args.shape, 1, args.seed_se), create_name(args), args); break; case syncmer: match(sequence_file1, sequence_file2, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se), create_name(args), args); diff --git a/src/main.cpp b/src/main.cpp index e6c244c..8ef8dc5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -154,15 +154,20 @@ int distance(seqan3::argument_parser & parser) { range_arguments args{}; std::filesystem::path sequence_file{}; + bool 
underlying_strobemer = false; parser.info.short_description = "Estimates the distance of the singular submers to each other for different methods."; parser.add_positional_option(sequence_file, "Please provide one sequence file."); all_arguments(parser, args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, - seqan3::value_list_validator{"minimiser", "modmer"}); + seqan3::value_list_validator{"minimiser", "modmer", "syncmer"}); + parser.add_flag(underlying_strobemer,'\0', "strobemer", "If strobemers should be used as base for representative " + "methods like minimizers. Default: False."); read_range_arguments_minimiser(parser, args); + read_range_arguments_strobemers(parser, args); + read_range_arguments_syncmers(parser, args); try { @@ -176,7 +181,7 @@ int distance(seqan3::argument_parser & parser) } string_to_methods(method, args.name); - do_distance(sequence_file, args); + do_distance(sequence_file, args, underlying_strobemer); return 0; } diff --git a/src/snakemake/distance/Snakefile b/src/snakemake/distance/Snakefile new file mode 100644 index 0000000..4be23e7 --- /dev/null +++ b/src/snakemake/distance/Snakefile @@ -0,0 +1,69 @@ +rule all: + input: + # Representative + ["0_minimiser_hash_20_"+str(w)+"_distance_"+str(error)+".out" for w in range(24,44,4) for error in [1,2,5,10]], + ["0_modmer_hash_20_"+str(w)+"_distance_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["syncmer_hash_20_"+str(w)+"_0_0_distance_"+str(error)+".out" for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["syncmer_hash_20_"+str(w)+"_0_6_distance_"+str(error)+".out" for w in [15,11,7,3,1] for error in [1,2,5,10]], + # Representative based on strobemers + ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + 
["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]] + +rule download_human_genome: + output: + "../results/GRCh38.p13.genome.fa.gz" + shell: + """wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/GRCh38.p13.genome.fa.gz" -O "../results/GRCh38.p13.genome.fa.gz" """ + +rule distance_minimiser_modmer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{shape}_{method}_hash_{kmer_size}_{w_size}_distance_{error}.out" + wildcard_constraints: + shape='[0-9]*', + method='(modmer|minimiser)' + shell: + "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape {wildcards.shape} -o {wildcards.shape}_ {input} > {wildcards.shape}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_distance_{wildcards.error}.out" + +rule distance_syncmer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_distance_{error}.out", + shell: + "minions distance --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input} > 
syncmer_hash_{wildcards.kmer_size}_{wildcards.w_size}_{wildcards.pos_begin}_{wildcards.pos_end}_distance_{wildcards.error}.out" + +rule distance_strobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_distance_{error}.out" + wildcard_constraints: + method='(min|rand|hybrid)', + kmer_size='[0-9]*', + order='(2|3)', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions distance --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input} > {wildcards.method}strobemers_{wildcards.kmer_size}_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_distance_{wildcards.error}.out" + +rule distance_minimiser_modmer_strobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{method2}strobemers_{order}_{wmin}_{wmax}_{method}_hash_{kmer_size}_{w_size}_distance_{error}.out" + wildcard_constraints: + method='(modmer|minimiser)', + method2='(min|rand|hybrid)', + order='(2|3)', + kmer_size='[0-9]*', + w_size='[0-9]*', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} {input} > {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_distance_{wildcards.error}.out" diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index ce40e5a..daafff1 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -6,14 +6,11 @@ target_use_datasources (comparison_test FILES example1.fasta example.ibf expecte add_api_test (hybridstrobe_test.cpp) add_api_test (hybridstrobe_hash_test.cpp) -add_api_test (minimiser_distance_test.cpp) - add_api_test (minstrobe_test.cpp) add_api_test (minstrobe_hash_test.cpp) add_api_test (modmer_test.cpp) 
add_api_test (modmer_hash_test.cpp) -add_api_test (modmer_hash_distance_test.cpp) add_api_test (randstrobe_test.cpp) add_api_test (randstrobe_hash_test.cpp) diff --git a/test/api/minimiser_distance_test.cpp b/test/api/minimiser_distance_test.cpp deleted file mode 100644 index 9123c93..0000000 --- a/test/api/minimiser_distance_test.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// ----------------------------------------------------------------------------------------------------- -// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin -// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik -// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License -// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md -// ----------------------------------------------------------------------------------------------------- - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" - -#include "minimiser_hash_distance.hpp" - -using seqan3::operator""_dna4; -using seqan3::operator""_shape; -using result_t = std::vector; - -inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); -inline static constexpr auto rev_kmer_view = seqan3::views::complement | std::views::reverse - | kmer_view - | std::views::reverse; -inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); -inline static constexpr auto rev_gapped_kmer_view = seqan3::views::complement | std::views::reverse - | seqan3::views::kmer_hash(0b1001_shape) - | std::views::reverse; -inline static constexpr auto minimiser_view1 = minimiser_distance(1); // kmer_size == window_size -inline static constexpr auto minimiser_no_rev_view = minimiser_distance(5); - -using iterator_type = std::ranges::iterator_t< decltype(std::declval() - | 
kmer_view - | minimiser_no_rev_view)>; -using two_ranges_iterator_type = std::ranges::iterator_t< decltype(seqan3::detail::minimiser_distance_view{ - std::declval() - | kmer_view, - std::declval() - | rev_kmer_view, - 5})>; - -template <> -struct iterator_fixture : public ::testing::Test -{ - using iterator_tag = std::forward_iterator_tag; - static constexpr bool const_iterable = true; - - seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; - decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; - result_t expected_range{0, 3, 1}; - - decltype(minimiser_distance(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 5)) test_range = - minimiser_distance(vec, 5); -}; - -template <> -struct iterator_fixture : public ::testing::Test -{ - using iterator_tag = std::forward_iterator_tag; - static constexpr bool const_iterable = true; - - seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; - using kmer_hash_view_t = decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})); - - kmer_hash_view_t vec = kmer_view(text); - result_t expected_range{0, 3, 1, 0, 0}; - - using reverse_kmer_hash_view_t = decltype(rev_kmer_view(text)); - - using test_range_t = decltype(seqan3::detail::minimiser_distance_view{kmer_hash_view_t{}, reverse_kmer_hash_view_t{}, 5}); - test_range_t test_range = seqan3::detail::minimiser_distance_view{vec, rev_kmer_view(text), 5}; -}; - -using test_types = ::testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_types, ); - -template -class minimiser_view_properties_test: public ::testing::Test { }; - -using underlying_range_types = ::testing::Types, - std::vector const, - seqan3::bitpacked_sequence, - seqan3::bitpacked_sequence const, - std::list, - std::list const, - std::forward_list, - std::forward_list const>; -TYPED_TEST_SUITE(minimiser_view_properties_test, underlying_range_types, ); - -class minimiser_test : public ::testing::Test -{ -protected: - std::vector 
text1{"AAAAAAAAAAAAAAAAAAA"_dna4}; - std::vector text1_short{"AAAAAA"_dna4}; - result_t result1{4, 4, 4}; // Same result for ungapped and gapped - result_t result1_short{15}; - - std::vector too_short_text{"AC"_dna4}; - - std::vector text3{"ACGGCGACGTTTAG"_dna4}; - result_t result3_ungapped{0, 3, 1, 0, 0}; // ACGG, CGAC, ACGT, aacg, aaac - lowercase for reverse complement - result_t result3_gapped{0, 4, 0, 0, 0}; // A--G, c--c, A--T, a--g, a--c - "-" for gap - result_t result3_ungapped_no_rev{0, 3, 1}; // ACGG, CGAC, ACGT - result_t result3_gapped_no_rev{0, 3, 1}; // A--G, C--C-, A--T "-" for gap - result_t result3_stop{0, 3}; // For stop at first T - result_t result3_gapped_stop{0, 4}; // A--G, c--c - result_t result3_start{2}; // For start at second A, ungapped and gapped the same - result_t result3_ungapped_no_rev_start{0}; // For start at second A - result_t result3_gapped_no_rev_start{0}; // For start at second A -}; - -template -void compare_types(adaptor_t v) -{ - EXPECT_TRUE(std::ranges::input_range); - EXPECT_TRUE(std::ranges::forward_range); - EXPECT_FALSE(std::ranges::bidirectional_range); - EXPECT_FALSE(std::ranges::random_access_range); - EXPECT_TRUE(std::ranges::view); - EXPECT_FALSE(std::ranges::sized_range); - EXPECT_FALSE(std::ranges::common_range); - EXPECT_TRUE(seqan3::const_iterable_range); - EXPECT_FALSE((std::ranges::output_range)); -} - -TYPED_TEST(minimiser_view_properties_test, concepts) -{ - TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, - 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - - auto v = text | kmer_view | minimiser_no_rev_view; - compare_types(v); - auto v2 = seqan3::detail::minimiser_distance_view{text | kmer_view, text | kmer_view, 5}; - - if constexpr (std::ranges::bidirectional_range) // excludes forward_list - { - auto v3 = seqan3::detail::minimiser_distance_view{text | kmer_view, text | rev_kmer_view, 5}; - compare_types(v3); - } -} - 
-TYPED_TEST(minimiser_view_properties_test, different_inputs_kmer_hash) -{ - TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, - 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - result_t ungapped{0, 3, 1, 0, 0}; // ACGT, CGAC, ACGT, aacg, aaac - lowercase for reverse comp. - result_t gapped{0, 4, 0, 0, 0}; // A--T, c--c, A--T, a--g, a--c - "-" for gap - result_t ungapped_no_rev{0, 3, 1}; // ACGT, CGAC, ACGT - result_t gapped_no_rev{0, 3, 1}; // A--T, C--C, A--T - "-" for gap - EXPECT_RANGE_EQ(ungapped_no_rev, text | kmer_view | minimiser_no_rev_view); - EXPECT_RANGE_EQ(gapped_no_rev, text | gapped_kmer_view | minimiser_no_rev_view); - - if constexpr (std::ranges::bidirectional_range) // excludes forward_list - { - EXPECT_RANGE_EQ(ungapped, (seqan3::detail::minimiser_distance_view{text | kmer_view, text | rev_kmer_view, 5})) ; - EXPECT_RANGE_EQ(gapped, (seqan3::detail::minimiser_distance_view{text | gapped_kmer_view, text | rev_gapped_kmer_view, 5})); - } -} - -TEST_F(minimiser_test, ungapped_kmer_hash) -{ - EXPECT_RANGE_EQ(result1, (seqan3::detail::minimiser_distance_view{text1 | kmer_view, text1 | rev_kmer_view, 5})); - EXPECT_RANGE_EQ(result1, text1 | kmer_view | minimiser_no_rev_view); - EXPECT_THROW(text1_short | kmer_view | minimiser_view1, std::invalid_argument); - auto empty_view = seqan3::detail::minimiser_distance_view{too_short_text | kmer_view, too_short_text | rev_kmer_view, 5}; - EXPECT_TRUE(std::ranges::empty(empty_view)); - auto empty_view2 = too_short_text | kmer_view | minimiser_no_rev_view; - EXPECT_TRUE(std::ranges::empty(empty_view2)); - EXPECT_RANGE_EQ(result3_ungapped, (seqan3::detail::minimiser_distance_view{text3 | kmer_view, text3 | rev_kmer_view, 5})); - EXPECT_RANGE_EQ(result3_ungapped_no_rev, text3 | kmer_view | minimiser_no_rev_view); - EXPECT_THROW((text3 | minimiser_hash_distance(seqan3::ungapped{4}, seqan3::window_size{3})), std::invalid_argument); - -} - 
-TEST_F(minimiser_test, gapped_kmer_hash) -{ - EXPECT_RANGE_EQ(result1, (seqan3::detail::minimiser_distance_view{text1 | gapped_kmer_view, - text1 | rev_gapped_kmer_view, - 5})); - EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | minimiser_no_rev_view); - EXPECT_THROW(text1_short | gapped_kmer_view | minimiser_view1, std::invalid_argument); - auto empty_view = seqan3::detail::minimiser_distance_view{too_short_text | gapped_kmer_view, - too_short_text | rev_gapped_kmer_view, - 5}; - EXPECT_TRUE(std::ranges::empty(empty_view)); - auto empty_view2 = too_short_text | gapped_kmer_view | minimiser_no_rev_view; - EXPECT_TRUE(std::ranges::empty(empty_view2)); - EXPECT_RANGE_EQ(result3_gapped, (seqan3::detail::minimiser_distance_view{text3 | gapped_kmer_view, - text3 | rev_gapped_kmer_view, - 5})); - EXPECT_RANGE_EQ(result3_gapped_no_rev, text3 | gapped_kmer_view | minimiser_no_rev_view); - EXPECT_THROW((text3 | minimiser_hash_distance(0b1001_shape, seqan3::window_size{3})), std::invalid_argument); -} - -TEST_F(minimiser_test, window_too_big) -{ - EXPECT_RANGE_EQ(result1_short, text1 | kmer_view | minimiser_distance(20)); - EXPECT_RANGE_EQ(result1_short, text1 | gapped_kmer_view | minimiser_distance(20)); - EXPECT_RANGE_EQ(result1_short, (seqan3::detail::minimiser_distance_view{text1 | kmer_view, text1 | rev_kmer_view, 20})); - EXPECT_RANGE_EQ(result1_short, (seqan3::detail::minimiser_distance_view{text1 | gapped_kmer_view, - text1 | rev_gapped_kmer_view, - 20})); -} - -TEST_F(minimiser_test, combinability) -{ - auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); - EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | kmer_view | minimiser_no_rev_view); - EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | gapped_kmer_view | minimiser_no_rev_view); - - EXPECT_RANGE_EQ(result3_stop, (seqan3::detail::minimiser_distance_view{text3 | stop_at_t | kmer_view, - text3 | stop_at_t | rev_kmer_view, - 5})); - EXPECT_RANGE_EQ(result3_gapped_stop, 
(seqan3::detail::minimiser_distance_view{text3 | stop_at_t | gapped_kmer_view, - text3 | stop_at_t | rev_gapped_kmer_view, - 5})); - - auto start_at_a = std::views::drop(6); - EXPECT_RANGE_EQ(result3_start, (seqan3::detail::minimiser_distance_view{text3 | start_at_a | kmer_view, - text3 | start_at_a | rev_kmer_view, - 5})); - EXPECT_RANGE_EQ(result3_start, (seqan3::detail::minimiser_distance_view{text3 | start_at_a | gapped_kmer_view, - text3 | start_at_a | rev_gapped_kmer_view, - 5})); -} - -/*TEST_F(minimiser_test, non_arithmetic_value) -{ - // just compute the minimizer directly on the alphabet - EXPECT_RANGE_EQ("ACACA"_dna4, text3 | minimiser_no_rev_view); -}*/ - -TEST_F(minimiser_test, two_ranges_unequal_size) -{ - EXPECT_THROW((seqan3::detail::minimiser_distance_view{text1 | kmer_view, text3 | rev_kmer_view, 5}), std::invalid_argument); -} diff --git a/test/api/modmer_hash_distance_test.cpp b/test/api/modmer_hash_distance_test.cpp deleted file mode 100644 index 4260f7d..0000000 --- a/test/api/modmer_hash_distance_test.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" - -#include "modmer_hash_distance.hpp" - -using seqan3::operator""_dna4; -using seqan3::operator""_shape; -using result_t = std::vector; - -using iterator_type = std::ranges::iterator_t() - | modmer_hash_distance(seqan3::ungapped{4}, - 2, - seqan3::seed{0}))>; - -static constexpr seqan3::shape ungapped_shape = seqan3::ungapped{4}; -static constexpr seqan3::shape gapped_shape = 0b1001_shape; -static constexpr auto ungapped_view = modmer_hash_distance(ungapped_shape, - 2, - seqan3::seed{0}); -static constexpr auto gapped_view = modmer_hash_distance(gapped_shape, - 2, - seqan3::seed{0}); - -template <> -struct iterator_fixture : public ::testing::Test -{ - using iterator_tag = std::forward_iterator_tag; - static constexpr bool 
const_iterable = false; - - seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; - result_t expected_range{6, 1, 0, 0}; - - using test_range_t = decltype(text | ungapped_view); - test_range_t test_range = text | ungapped_view; -}; - -using test_type = ::testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_type, ); - -template -class modmer_hash_distance_view_properties_test: public ::testing::Test { }; - -using underlying_range_types = ::testing::Types, - std::vector const, - seqan3::bitpacked_sequence, - seqan3::bitpacked_sequence const, - std::list, - std::list const>; - -TYPED_TEST_SUITE(modmer_hash_distance_view_properties_test, underlying_range_types, ); - -class modmer_hash_distance_test : public ::testing::Test -{ -protected: - std::vector text1{"AAAAAA"_dna4}; - result_t result1{}; // Same result for ungapped and gapped - - std::vector too_short_text{"AC"_dna4}; - - // ACGG CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG - // CCGT GCCG CGCC TCGC GTCG CGTC ACGT AACG AAAC TAAA CTAA - // ACGG CGGC cgcc GCGA CGAC cgtc ACGT aacg aaac taaa ctaa - std::vector text3{"ACGGCGACGTTTAG"_dna4}; - result_t result3{6, 1, 0, 0}; // ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA // A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap - result_t result3_stop{}; - result_t result3_start{0, 1, 0, 0}; -}; - -template -void compare_types(adaptor_t v) -{ - EXPECT_TRUE(std::ranges::input_range); - EXPECT_TRUE(std::ranges::forward_range); - EXPECT_FALSE(std::ranges::bidirectional_range); - EXPECT_FALSE(std::ranges::random_access_range); - EXPECT_TRUE(std::ranges::view); - EXPECT_FALSE(std::ranges::sized_range); - EXPECT_FALSE(std::ranges::common_range); - EXPECT_TRUE(seqan3::const_iterable_range); - EXPECT_FALSE((std::ranges::output_range)); -} - -TYPED_TEST(modmer_hash_distance_view_properties_test, different_input_ranges) -{ - TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 
'T'_dna4, - 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - result_t ungapped{0, 2, 2, 1, 0, 0}; // ACGT/ACGT, TCGA/TCGA, ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA - result_t gapped{0, 2, 2, 1, 0, 0}; // A--T/A--T, T--A/T--A, A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap - EXPECT_RANGE_EQ(ungapped, text | ungapped_view); - EXPECT_RANGE_EQ(gapped, text | gapped_view); -} - -TEST_F(modmer_hash_distance_test, ungapped) -{ - EXPECT_RANGE_EQ(result1, text1 | ungapped_view); - EXPECT_TRUE(std::ranges::empty(too_short_text | ungapped_view)); - EXPECT_RANGE_EQ(result3, text3 | ungapped_view); -} - -TEST_F(modmer_hash_distance_test, gapped) -{ - EXPECT_RANGE_EQ(result1, text1 | gapped_view); - EXPECT_TRUE(std::ranges::empty(too_short_text | gapped_view)); - EXPECT_RANGE_EQ(result3, text3 | gapped_view); -} - -TEST_F(modmer_hash_distance_test, combinability) -{ - auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); - EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | ungapped_view); - EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | gapped_view); - - auto start_at_a = std::views::drop(6); - EXPECT_RANGE_EQ(result3_start, text3 | start_at_a | ungapped_view); - EXPECT_RANGE_EQ(result3_start, text3 | start_at_a | gapped_view); -} diff --git a/test/cli/minions_distance_test.cpp b/test/cli/minions_distance_test.cpp index 7ad532f..1d46c44 100644 --- a/test/cli/minions_distance_test.cpp +++ b/test/cli/minions_distance_test.cpp @@ -18,7 +18,7 @@ TEST_F(cli_test, minimiser) { cli_test_result result = execute_app("minions distance --method minimiser -k 19 -w 19 ", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.out, std::string{"Distances: 0\t0\t0\t0\n"}); EXPECT_EQ(result.err, std::string{}); } @@ -26,15 +26,23 @@ TEST_F(cli_test, gapped_minimiser) { cli_test_result result = execute_app("minions distance --method minimiser -k 19 -w 19 --shape 524223", 
data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.out, std::string{"Distances: 0\t0\t0\t0\n"}); EXPECT_EQ(result.err, std::string{}); } TEST_F(cli_test, modmer) { - cli_test_result result = execute_app("minions distance --method modmer -k 19 -w 2 ", data("example1.fasta")); + cli_test_result result = execute_app("minions distance --method modmer -k 19 -w 2", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.out, std::string{"Distances: 0\t1.03566\t1.51183\t49\n"}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, syncmer) +{ + cli_test_result result = execute_app("minions distance --method syncmer -k 19 -w 2 -p 0", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{"Distances: 0\t3.74393\t4.88286\t40\n"}); EXPECT_EQ(result.err, std::string{}); } @@ -44,7 +52,7 @@ TEST_F(cli_test, wrong_method) std::string expected { "Error. Incorrect command line input for distance. Validation failed " - "for option --method: Value submer is not one of [minimiser,modmer].\n" + "for option --method: Value submer is not one of [minimiser,modmer,syncmer].\n" }; EXPECT_EQ(result.exit_code, 0); From c66e70b6adb71691e11c79fb1a4aa64cbfabd646 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Thu, 9 Feb 2023 15:14:30 +0100 Subject: [PATCH 19/34] Add strobemer as option to accuracy. 
--- src/compare.cpp | 15 ++++++++++++++- src/main.cpp | 3 ++- test/cli/minions_accuracy_test.cpp | 26 +++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/compare.cpp b/src/compare.cpp index e992389..87be16d 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -820,7 +820,20 @@ void do_accuracy(accuracy_arguments & args) case syncmer: accuracy(syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), create_name(args), args); break; - + case strobemer: { + if (args.hybrid & (args.order == 2)) + accuracy(hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.hybrid & (args.order == 3)) + accuracy(hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.minstrobers & (args.order == 2)) + accuracy(minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.minstrobers & (args.order == 3)) + accuracy(minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 2)) + accuracy(randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + else if (args.rand & (args.order == 3)) + accuracy(randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); + } } } diff --git a/src/main.cpp b/src/main.cpp index 8ef8dc5..1fe7813 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -79,7 +79,7 @@ int accuracy(seqan3::argument_parser & parser) std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, - seqan3::value_list_validator{"kmer", "minimiser", "modmer", "syncmer"}); + seqan3::value_list_validator{"kmer", "minimiser", "modmer", "syncmer", "strobemer"}); parser.add_option(args.search_file, '\0', "search-file", "A sequence files with sequences to search for.", 
seqan3::option_spec::required); parser.add_option(args.solution_file, '\0', "solution-file", "A file giving the correct files a sequence should be find in.", @@ -94,6 +94,7 @@ int accuracy(seqan3::argument_parser & parser) seqan3::option_spec::advanced); read_range_arguments_minimiser(parser, args); + read_range_arguments_strobemers(parser, args); read_range_arguments_syncmers(parser, args); try diff --git a/test/cli/minions_accuracy_test.cpp b/test/cli/minions_accuracy_test.cpp index 15d4b15..1efc4dc 100644 --- a/test/cli/minions_accuracy_test.cpp +++ b/test/cli/minions_accuracy_test.cpp @@ -78,13 +78,37 @@ TEST_F(cli_test, sequence_file) EXPECT_EQ(result.err, std::string{}); } +TEST_F(cli_test, hybridstrobemer) +{ + cli_test_result result = execute_app("minions accuracy --method strobemer --hybrid -k 8 --w-min 0 --w-max 16 --order 2 --ibfsize 10000 ", data("example1.fasta"), "--search-file", data("search.fasta"), "--solution-file", data("expected_search_result.out")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, minstrobemer) +{ + cli_test_result result = execute_app("minions accuracy --method strobemer --min -k 8 --w-min 0 --w-max 16 --order 2 --ibfsize 10000 ", data("example1.fasta"), "--search-file", data("search.fasta"), "--solution-file", data("expected_search_result.out")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, randstrobemer) +{ + cli_test_result result = execute_app("minions accuracy --method strobemer --rand -k 8 --w-min 0 --w-max 16 --order 2 --ibfsize 10000 ", data("example1.fasta"), "--search-file", data("search.fasta"), "--solution-file", data("expected_search_result.out")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + TEST_F(cli_test, wrong_method) { cli_test_result result = 
execute_app("minions accuracy --method submer -k 19 --search-file ", data("search.fasta"), data("example.ibf")); std::string expected { "Error. Incorrect command line input for accuracy. Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser,modmer,syncmer].\n" + "for option --method: Value submer is not one of [kmer,minimiser,modmer,syncmer,strobemer].\n" }; EXPECT_EQ(result.exit_code, 0); From b72f0f5a34e1ba12fbcce364f39458b5dbd9cdc3 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 5 May 2023 23:12:33 +0200 Subject: [PATCH 20/34] Accuracy Workflow --- .gitignore | 2 + src/snakemake/accuracy/Snakefile | 185 ++++++++++++++++++ src/snakemake/accuracy/plot_match.py | 83 ++++++++ .../accuracy/plot_match_representative.py | 70 +++++++ 4 files changed, 340 insertions(+) create mode 100644 src/snakemake/accuracy/Snakefile create mode 100644 src/snakemake/accuracy/plot_match.py create mode 100644 src/snakemake/accuracy/plot_match_representative.py diff --git a/.gitignore b/.gitignore index 4b24751..fc45346 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,8 @@ src/snakemake/ !src/snakemake/accuracy/add_errors.py !src/snakemake/accuracy/README !src/snakemake/accuracy/Snakefile +!src/snakemake/accuracy/plot_match.py +!src/snakemake/accuracy/plot_match_representative.py !src/snakemake/distance/Snakefile diff --git a/src/snakemake/accuracy/Snakefile b/src/snakemake/accuracy/Snakefile new file mode 100644 index 0000000..0186590 --- /dev/null +++ b/src/snakemake/accuracy/Snakefile @@ -0,0 +1,185 @@ +rule all: + input: + # Match + [str(shape)+"_minimiser_hash_16_16_match_"+str(error)+".out" for shape in ["0", "36607", "51755"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_20_20_match_"+str(error)+".out" for shape in ["0", "933855", "975475"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_24_24_match_"+str(error)+".out" for shape in ["0", "14548847", "13954519"] for error in [1,2,5,10]], + 
[str(shape)+"_minimiser_hash_28_28_match_"+str(error)+".out" for shape in ["0", "234879855", "241004285"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_32_32_match_"+str(error)+".out" for shape in ["0", "3169577727", "3856068575"] for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + # Representative + ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4) for error in [1,2,5,10]], + ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1] for error in [1,2,5,10]], + # Representative based on strobemers + ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in 
range(24,44,4) for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + # Accuracy + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # Representative + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,5] for 
threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]] +# ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in range(16,36,4) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # Representative +# 
["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in range(24,44,4) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3,5,7,9,11] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18,16,14,12,10] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + # [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15,11,7,3,1] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + +rule download_example_Data: + output: + "../results/simulated_reads100000.fa" + shell: + """wget "https://ftp.imp.fu-berlin.de/pub/darvish/simulated_reads100000.fa.fa.gz" -O ../results/simulated_reads100000.fa.gz + gunzip ../results/simulated_reads100000.fa.gz""" + +rule add_error: + input: + "../results/simulated_reads100000.fa" + output: + "simulated_reads100000_{error}.fa" + shell: + "python3 add_errors.py ../results/simulated_reads100000.fa simulated_reads100000_{wildcards.error}.fa {wildcards.error}" + +rule match_minimiser_modmer: + input: + "../results/simulated_reads100000.fa", + "simulated_reads100000_{error}.fa" + output: + "{shape}_{method}_hash_{kmer_size}_{w_size}_match_{error}.out" + wildcard_constraints: + shape='[0-9]*', + method='(modmer|minimiser)' + shell: + "minions match --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape {wildcards.shape} -o {wildcards.shape}_ {input} > {wildcards.shape}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_match_{wildcards.error}.out" + +rule match_syncmer: + input: + "../results/simulated_reads100000.fa", + "simulated_reads100000_{error}.fa" + output: + "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_match_{error}.out", + shell: + "minions match 
--method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input} > syncmer_hash_{wildcards.kmer_size}_{wildcards.w_size}_{wildcards.pos_begin}_{wildcards.pos_end}_match_{wildcards.error}.out" + +rule match_strobemer: + input: + "../results/simulated_reads100000.fa", + "simulated_reads100000_{error}.fa" + output: + "{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_match_{error}.out" + wildcard_constraints: + method='(min|rand|hybrid)', + kmer_size='[0-9]*', + order='(2|3)', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions match --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input} > {wildcards.method}strobemers_{wildcards.kmer_size}_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_match_{wildcards.error}.out" + +rule match_minimiser_modmer_strobemer: + input: + "../results/simulated_reads100000.fa", + "simulated_reads100000_{error}.fa" + output: + "{method2}strobemers_{order}_{wmin}_{wmax}_{method}_hash_{kmer_size}_{w_size}_match_{error}.out" + wildcard_constraints: + method='(modmer|minimiser)', + method2='(min|rand|hybrid)', + order='(2|3)', + kmer_size='[0-9]*', + w_size='[0-9]*', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions match --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} {input} > {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_match_{wildcards.error}.out" + +rule match_minimiser_modmer_accuracy_ibf: + input: + ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] + output: + "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}.ibf" + wildcard_constraints: + shape='[0-9]*', + method='(modmer|minimiser)' + shell: + "minions accuracy 
--method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --threshold {wildcards.threshold} --shape {wildcards.shape} -o {wildcards.shape}_{wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + +rule match_minimiser_modmer_accuracy: + input: + "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}.ibf" + output: + "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}_all_accuracy.out" + wildcard_constraints: + shape='[0-9]*', + method='(modmer|minimiser)' + shell: + "minions accuracy --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --threshold {wildcards.threshold} --shape {wildcards.shape} -o {wildcards.shape}_{wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + +rule match_strobemer_accuracy_ibf: + input: + ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] + output: + "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" + wildcard_constraints: + method='(min|rand|hybrid)', + kmer_size='[0-9]*', + order='(2|3)', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions accuracy --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + +rule match_strobemer_accuracy: + input: + "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" + output: + "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_all_accuracy.out" + wildcard_constraints: + method='(min|rand|hybrid)', + kmer_size='[0-9]*', + order='(2|3)', + wmin='[0-9]*', + 
wmax='[0-9]*' + shell: + "minions accuracy --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + +rule match_syncmer_accuracy_ibf: + input: + ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] + output: + "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" + shell: + "minions accuracy --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + +rule match_syncmer_accuracy: + input: + "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" + output: + "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_all_accuracy.out" + shell: + "minions accuracy --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" diff --git a/src/snakemake/accuracy/plot_match.py b/src/snakemake/accuracy/plot_match.py new file mode 100644 index 0000000..09bd86b --- /dev/null +++ b/src/snakemake/accuracy/plot_match.py @@ -0,0 +1,83 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +#k_size = [16,20,24,28,32] +#pos = [x+0.25 for x in range(len(k_size))] +#strobe_range = [int(k/2) for k in k_size] +k_size = [16,20,24,28,32] +pos = [x+0.25 for x in 
range(len(k_size))] +pos_order3 = [1.25,4.25,7.25] +k_order3 = [9,12,15] +k_size_order3 = [i*2 for i in k_order3] +strobe_range = [k for k in range(8,17,2)] + +def read_file(results, files): + cov = 0.0 + for file in files: + with open(file, 'r') as f: + for line in f: + if (line[:7]=="Match C"): + cov = round(float(line.split()[2]),2) + if (line[:7]=="Islands"): + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev,cov)) + return results + +# Read all files for an error +for error in [1,2,5,10]: + kmers = read_file([], ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_match_"+str(error)+".out" for k in range(16,36,4)]) + shapes4 = ["36607","933855","14548847","234879855","3169577727"] + gapped4_kmers = read_file([], [shapes4[i] + "_minimiser_hash_"+str(k_size[i])+"_"+str(k_size[i])+"_match_"+str(error)+".out" for i in range(len(k_size))]) + shapes8 = ["51755","975475","13954519","241004285","3856068575"] + gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size[i])+"_"+str(k_size[i])+"_match_"+str(error)+".out" for i in range(len(k_size))]) + + minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in strobe_range]) + hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) + randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in strobe_range]) + minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) + hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) + randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) + + + # Plot comparison between 
all Island size + fig = plt.figure() + X = np.arange(len(k_size)) + + colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + plt.xlabel("k") + plt.xticks(pos, k_size) + plt.ylabel("Average island size") + + plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos, [x[2] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/Match_island_"+str(error)+".png",bbox_inches='tight') + + # Plot comparison between all match coverage + fig = plt.figure() + X = np.arange(len(k_size)) + + colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + plt.xlabel("k") + plt.xticks(pos, k_size) + plt.ylabel("Match coverage") + + plt.plot(pos, [x[2] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos, [x[2] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos, [x[2] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos, [x[2] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos, [x[2] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos, [x[2] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/Match_cov_"+str(error)+".png",bbox_inches='tight') diff --git 
a/src/snakemake/accuracy/plot_match_representative.py b/src/snakemake/accuracy/plot_match_representative.py new file mode 100644 index 0000000..7d1bdde --- /dev/null +++ b/src/snakemake/accuracy/plot_match_representative.py @@ -0,0 +1,70 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +#k_size = [16,20,24,28,32] +#pos = [x+0.25 for x in range(len(k_size))] +#strobe_range = [int(k/2) for k in k_size] +k_size = [16,20,24,28,32] +pos = [x+0.25 for x in range(len(k_size))] +pos_order3 = [1.25,4.25,7.25] +k_order3 = [9,12,15] +k_size_order3 = [i*2 for i in k_order3] +strobe_range = [k for k in range(8,17,2)] + +def read_file(results, files): + cov = 0.0 + for file in files: + with open(file, 'r') as f: + for line in f: + if (line[:7]=="Match C"): + cov = round(float(line.split()[2]),2) + if (line[:7]=="Islands"): + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev,cov)) + return results + +# Read all files for an error +for error in [1,2,5,10]: + minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4)]) + modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1]]) + + # Plot comparison between all Island size + fig = plt.figure() + X = np.arange(len(k_size)) + + colors = ["#01d63a","#00e7e0","#fefea1","#748beb"] + plt.xlabel("k") + plt.xticks(pos, k_size) + print(opensyncmer)#plt.ylabel("Average island size") + + plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) + plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in 
opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/Match_island_representative"+str(error)+".png",bbox_inches='tight') + + # Plot comparison between all match coverage + fig = plt.figure() + X = np.arange(len(k_size)) + + colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + plt.xlabel("k") + plt.xticks(pos, k_size) + plt.ylabel("Match coverage") + + plt.plot(pos, [x[2] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) + plt.plot(pos, [x[2] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) + plt.plot(pos, [x[2] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[2] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/Match_cov_representative"+str(error)+".png",bbox_inches='tight') From 239bed0de3e6e206085687763e8da96437156f57 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Tue, 30 May 2023 16:22:33 +0200 Subject: [PATCH 21/34] Update. 
--- src/snakemake/accuracy/Snakefile | 104 ++++++++++++------ src/snakemake/accuracy/plot_accuracy.py | 76 +++++++++++++ src/snakemake/accuracy/plot_match.py | 2 +- .../accuracy/plot_match_representative.py | 60 ++++++++-- src/snakemake/speed/Snakefile | 22 ++-- src/snakemake/speed/plot_speed.py | 82 ++++++++------ .../speed/plot_speed_representative.py | 77 +++++++++++++ 7 files changed, 340 insertions(+), 83 deletions(-) create mode 100644 src/snakemake/accuracy/plot_accuracy.py create mode 100644 src/snakemake/speed/plot_speed_representative.py diff --git a/src/snakemake/accuracy/Snakefile b/src/snakemake/accuracy/Snakefile index 0186590..e5db1c0 100644 --- a/src/snakemake/accuracy/Snakefile +++ b/src/snakemake/accuracy/Snakefile @@ -2,16 +2,30 @@ rule all: input: # Match [str(shape)+"_minimiser_hash_16_16_match_"+str(error)+".out" for shape in ["0", "36607", "51755"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_18_18_match_"+str(error)+".out" for shape in ["0", "233469", "246365"] for error in [1,2,5,10]], [str(shape)+"_minimiser_hash_20_20_match_"+str(error)+".out" for shape in ["0", "933855", "975475"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_22_22_match_"+str(error)+".out" for shape in ["0", "4192891", "3669089"] for error in [1,2,5,10]], [str(shape)+"_minimiser_hash_24_24_match_"+str(error)+".out" for shape in ["0", "14548847", "13954519"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_26_26_match_"+str(error)+".out" for shape in ["0", "62257151", "66560815"] for error in [1,2,5,10]], [str(shape)+"_minimiser_hash_28_28_match_"+str(error)+".out" for shape in ["0", "234879855", "241004285"] for error in [1,2,5,10]], + [str(shape)+"_minimiser_hash_30_30_match_"+str(error)+".out" for shape in ["0", "805287931", "1004529051"] for error in [1,2,5,10]], [str(shape)+"_minimiser_hash_32_32_match_"+str(error)+".out" for shape in ["0", "3169577727", "3856068575"] for error in [1,2,5,10]], + 
["0_minimiser_hash_"+str(k)+"_"+str(k)+"_match_"+str(error)+".out" for k in range(16,34,2) for error in [1,2,5,10]], + [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+ "_"+str([k for k in range(16,34,2)][i]+4)+ "_match_"+str(error)+".out" for i in range(9) for error in [1,2,5,10]], + [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in range(16,34,2)][i]+8)+"_match_"+str(error)+".out" for i in range(9) for error in [1,2,5,10]], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + # 8 "gaps" + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], 
+ ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], # Representative ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4) for error in [1,2,5,10]], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], @@ -24,31 +38,39 @@ rule all: ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + 
["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], 
# Accuracy - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [20] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for 
error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], # Representative - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]] -# ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in range(16,36,4) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # 
[str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in range(8,17) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # Representative -# ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in range(24,44,4) for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3,5,7,9,11] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18,16,14,12,10] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - # [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15,11,7,3,1] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" 
for w in [24] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], +# ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]] rule download_example_Data: output: @@ -118,20 +140,38 @@ rule match_minimiser_modmer_strobemer: shell: "minions match --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} {input} > {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_match_{wildcards.error}.out" +rule match_syncmer_strobemer: + input: + "../results/simulated_reads100000.fa", + "simulated_reads100000_{error}.fa" + output: + "{method2}strobemers_{order}_{wmin}_{wmax}_{method}_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_match_{error}.out" + wildcard_constraints: + method='(syncmer)', + method2='(min|rand|hybrid)', + order='(2|3)', + kmer_size='[0-9]*', + w_size='[0-9]*', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions match --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} -p {wildcards.pos_begin} -p {wildcards.pos_end} {input} > {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_{wildcards.pos_begin}_{wildcards.pos_end}_match_{wildcards.error}.out" + + rule 
match_minimiser_modmer_accuracy_ibf: input: ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] output: - "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}.ibf" + "{shape}_{error}_{method}_hash_{kmer_size}_{w_size}.ibf" wildcard_constraints: shape='[0-9]*', method='(modmer|minimiser)' shell: - "minions accuracy --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --threshold {wildcards.threshold} --shape {wildcards.shape} -o {wildcards.shape}_{wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + "minions accuracy --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --threshold 0.1 --shape {wildcards.shape} -o {wildcards.shape}_{wildcards.error}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" rule match_minimiser_modmer_accuracy: input: - "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}.ibf" + "{shape}_{error}_{method}_hash_{kmer_size}_{w_size}.ibf" output: "{shape}_{error}_{threshold}_{method}_hash_{kmer_size}_{w_size}_all_accuracy.out" wildcard_constraints: @@ -144,7 +184,7 @@ rule match_strobemer_accuracy_ibf: input: ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] output: - "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" + "{error}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" wildcard_constraints: method='(min|rand|hybrid)', kmer_size='[0-9]*', @@ -152,11 +192,11 @@ rule match_strobemer_accuracy_ibf: wmin='[0-9]*', wmax='[0-9]*' shell: - "minions accuracy --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 
64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + "minions accuracy --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} --threshold 0.1 --shape 0 -o {wildcards.error}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" rule match_strobemer_accuracy: input: - "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" + "{error}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}.ibf" output: "{error}_{threshold}_{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_all_accuracy.out" wildcard_constraints: @@ -172,13 +212,13 @@ rule match_syncmer_accuracy_ibf: input: ["64/bins/bin_"+"%02d" % (i,) + ".fasta" for i in range(64)] output: - "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" + "{error}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" shell: - "minions accuracy --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --threshold {wildcards.threshold} --shape 0 -o {wildcards.error}_{wildcards.threshold}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" + "minions accuracy --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --threshold 0.1 --shape 0 -o {wildcards.error}_ --ibfsize 536870912 --search-file 64/reads_e{wildcards.error}_100/all.fastq --solution-file search_results.out {input}" rule match_syncmer_accuracy: input: - "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" + "{error}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}.ibf" output: "{error}_{threshold}_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_all_accuracy.out" shell: diff --git 
a/src/snakemake/accuracy/plot_accuracy.py b/src/snakemake/accuracy/plot_accuracy.py new file mode 100644 index 0000000..4975502 --- /dev/null +++ b/src/snakemake/accuracy/plot_accuracy.py @@ -0,0 +1,76 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7] +pos = [x+0.25 for x in range(len(thresholds))] +strobe_range = [10] + +def read_file(results, files): + num_fp = 0 + fn_0 = 0.0 + i = 0 + for file in files: + with open(file, 'r') as f: + for line in f: + tp = int(line.split()[1]) + tn = int(line.split()[2]) + fp = int(line.split()[3]) + fn = int(line.split()[4]) + if (fn == 0): + fn_0 = thresholds[i] + num_fp = fp + i += 1 + results.append([num_fp, fn_0]) + +# Read all files for an error +for error in [2,5]: + results = [] + read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, ["933855_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, ["975475_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + + read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in 
thresholds]) + read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + + print("Error: ",error, "\n", results) + +def fix(): + fig = plt.figure() + labels = ['k-mer','4 k-mer','8 k-mer', 'minstrobemers','hybridstrobemers','randstrobemers'] + + colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + #plt.xlabel("Threshold") + plt.xticks(pos, thresholds) + plt.ylabel("# False Positives") + y_pos = np.arange(len(labels)) + + plt.bar(y_pos, [x[0] for x in results[:6]], align='center') + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/FPR_"+str(error)+".png",bbox_inches='tight') + + # Plot comparison between all match coverage + fig = plt.figure() + X = np.arange(len(thresholds)) + + colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + plt.xlabel("Threshold") + plt.xticks(pos, thresholds) + plt.ylabel("False Negative Rate") + + plt.plot(pos, [x[1] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos, [x[1] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos, [x[1] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos, [x[1] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos, [x[1] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos, [x[1] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.savefig("../results/FNR_"+str(error)+".png",bbox_inches='tight') diff --git a/src/snakemake/accuracy/plot_match.py 
b/src/snakemake/accuracy/plot_match.py index 09bd86b..a262d71 100644 --- a/src/snakemake/accuracy/plot_match.py +++ b/src/snakemake/accuracy/plot_match.py @@ -57,7 +57,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) - plt.plot(pos, [x[2] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Match_island_"+str(error)+".png",bbox_inches='tight') diff --git a/src/snakemake/accuracy/plot_match_representative.py b/src/snakemake/accuracy/plot_match_representative.py index 7d1bdde..586a260 100644 --- a/src/snakemake/accuracy/plot_match_representative.py +++ b/src/snakemake/accuracy/plot_match_representative.py @@ -27,13 +27,7 @@ def read_file(results, files): results.append((mean,stdev,cov)) return results -# Read all files for an error -for error in [1,2,5,10]: - minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4)]) - modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1]]) - +def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2): # Plot comparison between all Island size fig = plt.figure() X = np.arange(len(k_size)) @@ -41,7 +35,7 @@ def read_file(results, files): colors = ["#01d63a","#00e7e0","#fefea1","#748beb"] 
plt.xlabel("k") plt.xticks(pos, k_size) - print(opensyncmer)#plt.ylabel("Average island size") + plt.ylabel("Average island size") plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) @@ -49,7 +43,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") - plt.savefig("../results/Match_island_representative"+str(error)+".png",bbox_inches='tight') + plt.savefig(outfile1,bbox_inches='tight') # Plot comparison between all match coverage fig = plt.figure() @@ -67,4 +61,50 @@ def read_file(results, files): plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") - plt.savefig("../results/Match_cov_representative"+str(error)+".png",bbox_inches='tight') + plt.savefig(outfile2,bbox_inches='tight') + + +# Read all files for an error +for error in [1,2,5,10]: + minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4)]) + modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1]]) + + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative"+str(error)+".png","../results/Match_cov_representative"+str(error)+".png") + + minimiser = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in 
[3,5,7,9,11]]) + opensyncmer = read_file([],["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_hybrid1_"+str(error)+".png","../results/Match_cov_representative_hybrid1_"+str(error)+".png") + + minimiser = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_min1_"+str(error)+".png","../results/Match_cov_representative_min1_"+str(error)+".png") + + minimiser = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], 
["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_rand1_"+str(error)+".png","../results/Match_cov_representative_rand1_"+str(error)+".png") + + minimiser = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_hybrid_"+str(error)+".png","../results/Match_cov_representative_hybrid_"+str(error)+".png") + + minimiser = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, 
"../results/Match_island_representative_min_"+str(error)+".png","../results/Match_cov_representative_min_"+str(error)+".png") + + minimiser = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_rand_"+str(error)+".png","../results/Match_cov_representative_rand_"+str(error)+".png") diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index 23b0166..c9282a3 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -7,7 +7,7 @@ rule all: "../results/Speed_minstrobemers_original.png", "../results/Speed_hybridstrobemers_original.png", "../results/Speed_randstrobemers_original.png", - "../results/Speed_randstrobemers_original_order3.png" + "../results/Speed_randstrobemers_original_order3.png", "../results/Speed_representative.png" rule plot: @@ -31,30 +31,34 @@ rule plot: [shape + "_kmer_hash_28_speed.out" for shape in ["0", "234879855", "241004285"]], [shape + "_kmer_hash_30_speed.out" for shape in ["0", "805287931", "1004529051"]], [shape + "_kmer_hash_32_speed.out" for shape in ["0", "3169577727", "3856068575"]], + # Shapes k + 4/8 + ["0_kmer_hash_"+str(k)+"_speed.out" for k in range(16,34,2)], + [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_kmer_hash_"+str([k 
for k in range(16,34,2)][i]+4)+"_speed.out" for i in range(9)], + [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_kmer_hash_"+str([k for k in range(16,34,2)][i]+8)+"_speed.out" for i in range(9)], # 4 "gaps" ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [6,8,10]], ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [9,12,15]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [6,8,10]], # 8 "gaps" ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [9,12,15]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], # 4 "gaps" ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in 
range(8,17)], ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], - ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [9,12,15]], + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], # 8 "gaps" ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], - ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [9,12,15]] + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [6,8,10]] shell: "python3 plot_speed.py" rule plot_representative: diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 8081cab..51e089c 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -10,8 +10,8 @@ k_size = [16,18,20,22,24,26,28,30,32] pos = [x+0.25 for x in range(len(k_size))] pos_order3 = [1.25,4.25,7.25] -k_order3 = [9,12,15] -k_size_order3 = [i*2 for i in k_order3] +k_order3 = [6,8,10] +k_size_order3 = [i*3 for i in k_order3] strobe_range = [k for k in range(8,17)] def read_file(results, files): @@ -25,15 +25,19 @@ def read_file(results, files): # Read all files kmers = read_file([], ["0_kmer_hash_"+str(k)+"_speed.out" for k in k_size]) -shapes4 = ["36607","233469","933855","4192891","14548847","62257151","234879855","805287931","3169577727"] -gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) -shapes8 = ["51755","246365","975475","3669089","13954519","66560815","241004285","1004529051","3856068575"] -gapped8_kmers = read_file([], 
[shapes8[i] + "_kmer_hash_"+str(k_size[i])+"_speed.out" for i in range(len(k_size))]) +#shapes4 = ["36607","233469","933855","4192891","14548847","62257151","234879855","805287931","3169577727"] +shapes4=['777695', '2621175', '16252901', '50196477', '251620351', '905838335', '4286578095', '13958643693', '66035113981'] +gapped4_kmers = read_file([], [shapes4[i] + "_kmer_hash_"+str(k_size[i]+4)+"_speed.out" for i in range(len(k_size))]) +#shapes8 = ["51755","246365","975475","3669089","13954519","66560815","241004285","1004529051","3856068575"] +shapes8 = ['14021527', '45607667', '180082591', '1068161519', '3522001919', '13957854679', '64423783901', '205814423455', '1094946651927'] +gapped8_kmers = read_file([], [shapes8[i] + "_kmer_hash_"+str(k_size[i]+8)+"_speed.out" for i in range(len(k_size))]) kmers_order3 = read_file([], ["0_kmer_hash_"+str(k)+"_speed.out" for k in k_size_order3]) -shapes4_order3 = ["233469","14548847","805287931"] +#shapes4_order3 = ["233469","14548847","805287931"] +shapes4_order3 = ['2621175', '251620351', '13958643693'] gapped4_order3 = read_file([], [shapes4_order3[i] + "_kmer_hash_"+str(k_size_order3[i])+"_speed.out" for i in range(len(k_order3))]) -shapes8_order3 = ["246365","13954519","1004529051"] +#shapes8_order3 = ["246365","13954519","1004529051"] +shapes8_order3 = ['45607667', '3522001919', '205814423455'] gapped8_order3 = read_file([], [shapes8_order3[i] + "_kmer_hash_"+str(k_size_order3[i])+"_speed.out" for i in range(len(k_order3))]) minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in strobe_range]) @@ -48,12 +52,16 @@ def read_file(results, files): hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) randstrobemers38 = 
read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) +original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) # Plot comparison between all fig = plt.figure() X = np.arange(len(k_size)) colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +colors = ["#004c6d","#009dbe","#00f6ff","#fee6ce","#fdae6b","#e6550d"] +colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds @@ -64,6 +72,8 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) +#plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors[5], label='randstrobemers', linewidth=3.0) +#plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors[1], label='8 ori') plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all.png",bbox_inches='tight') @@ -72,7 +82,7 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size)) -colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +#colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds @@ -84,6 +94,8 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in randstrobemers28], 
color = colors[5], label='randstrobemers',linewidth=3.0) +#plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors[5], label='ranstrobemers', linewidth=3.0) + plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all8.png",bbox_inches='tight') @@ -91,18 +103,20 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size_order3)) -colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +#colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos_order3, k_size_order3) plt.ylabel("Speed in microseconds") # in microseconds +original_randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) + plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[4], label='hybridstrobemers',linewidth=3.0) -plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) - +#plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) #plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all_order3.png",bbox_inches='tight') @@ -110,17 +124,20 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size_order3)) -colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] +#colors = 
["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos_order3, k_size_order3) plt.ylabel("Speed in microseconds") # in microseconds +original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) + plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='hybridstrobemers',linewidth=3.0) -plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) +#plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all8_order3.png",bbox_inches='tight') @@ -138,7 +155,7 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size)) -colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] +#colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] colors_error = ["#748beb","#e47585","#b261c2","#01d63a","#00e7e0","#fefea1"] plt.xlabel("k") plt.xticks(pos, k_size) @@ -166,11 +183,14 @@ def read_file(results, files): plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds +colors_ori = ["#bae4bc","#7bccc4","#43a2ca","#0868ac"] +colors_ori = ["#e66101","#fdb863","#b2abd2","#5e3c99"] +colors_ori = ["#dfc27d","#a6611a","#80cdc1","#018571"] 
-plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[2], label='4') -plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[5], label='8') -plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors[0], label='4 ori') -plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors[1], label='8 ori') +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors_ori[0], label='4', linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) #plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[2], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[5], alpha=0.7) @@ -185,10 +205,10 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[2], label='4') -plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[5], label='8') -plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors[0], label='4 ori') -plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors[1], label='8 ori') +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors_ori[0], label='4',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) #plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in 
hybridstrobemers2], color = colors_error[2], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[5], alpha=0.7) @@ -203,10 +223,10 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='4') -plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='8') -plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors[0], label='4 ori') -plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors[1], label='8 ori') +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors_ori[0], label='4',linewidth=3.0) +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) #plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) @@ -221,10 +241,10 @@ def read_file(results, files): plt.xticks(pos_order3, k_order3) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='4') -plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='8') -plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors[0], label='4 ori') -plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors[1], label='8 ori') +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors_ori[0], label='4',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in 
randstrobemers38], color = colors_ori[1], label='8',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors_ori[2], label='4 ori',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors_ori[3], label='8 ori',linewidth=3.0) #plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) #plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) diff --git a/src/snakemake/speed/plot_speed_representative.py b/src/snakemake/speed/plot_speed_representative.py new file mode 100644 index 0000000..8937340 --- /dev/null +++ b/src/snakemake/speed/plot_speed_representative.py @@ -0,0 +1,77 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +def read_file(results, files): + for file in files: + with open(file, 'r') as f: + for line in f: + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev)) + return results + +minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_speed.out" for w in [i for i in range(24,44,4)]]) +minimiser_setw = read_file([], ["0_minimiser_hash_"+str(k)+"_40_speed.out" for k in [i for i in range(16,36,4)]]) +# modmer +modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_speed.out" for w in [3,5,7,9,11]]) +modmer_setw = read_file([], ["0_modmer_hash_"+str(k)+"_7_speed.out" for k in [i for i in range(16,36,4)]]) +# syncmer +opensyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_0_speed.out" for w in [18,16,14,12,10]]) +opensyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_10_0_0_speed.out" for k in [i for i in range(22,12,-2)]]) +closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_speed.out" for w in [15,11,7,3,1]]) +closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_6_speed.out" 
for k in [i for i in range(28,8,-4)]]) + + +# Plot comparison between k-mers +k_size = [i for i in range(5)] +pos = [x+0.25 for x in range(len(k_size))] + +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] +colors_error = ["#01d63a","#00e7e0","#fefea1","#748beb"] +plt.xlabel("w,m or s") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) +plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer', linewidth=3.0) +plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer', linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + +#plt.fill_between(pos, [x[0]-x[1] for x in minimiser], [x[0]+x[1] for x in minimiser], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in modmer], [x[0]+x[1] for x in modmer], color = colors_error[1], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in opensyncmer], [x[0]+x[1] for x in opensyncmer], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in closedsyncmer], [x[0]+x[1] for x in closedsyncmer], color = colors_error[3], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75),title="Methods") +plt.savefig("../results/Speed_representative.png", bbox_inches ='tight') + + +# Plot comparison between k-mers +k_size = [i for i in range(16,36,4)] +pos = [x+0.25 for x in range(len(k_size))] + +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] +colors_error = ["#01d63a","#00e7e0","#fefea1","#748beb"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minimiser_setw], color = colors[0], label='(40,k)-minimizer',linewidth=3.0) +plt.plot(pos, [x[0] for x in 
modmer_setw], color = colors[1], label='(k,7)-modmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in opensyncmer_setw], color = colors[2], label='(k,10,[0],1)-syncmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer_setw], color = colors[3], label='(k,3,[0,6],1)-syncmer',linewidth=3.0) + +#plt.fill_between(pos, [x[0]-x[1] for x in minimiser_setw], [x[0]+x[1] for x in minimiser_setw], color = colors_error[0], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in modmer_setw], [x[0]+x[1] for x in modmer_setw], color = colors_error[1], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in opensyncmer_setw], [x[0]+x[1] for x in opensyncmer_setw], color = colors_error[2], alpha=0.7) +#plt.fill_between(pos, [x[0]-x[1] for x in closedsyncmer_setw], [x[0]+x[1] for x in closedsyncmer_setw], color = colors_error[3], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75),title="Methods") +plt.savefig("../results/Speed_representative_setw.png", bbox_inches ='tight') From 8f3e68303360e6c5f1ef53c66d2cbe349a00d709 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Tue, 30 May 2023 16:30:09 +0200 Subject: [PATCH 22/34] counts --- .gitignore | 4 + src/snakemake/count/Snakefile | 278 ++++++++++++ src/snakemake/count/plot_counts.py | 271 ++++++++++++ .../count/plot_counts_representative.py | 102 +++++ .../count/plot_counts_representative2.py | 400 ++++++++++++++++++ 5 files changed, 1055 insertions(+) create mode 100644 src/snakemake/count/Snakefile create mode 100644 src/snakemake/count/plot_counts.py create mode 100644 src/snakemake/count/plot_counts_representative.py create mode 100644 src/snakemake/count/plot_counts_representative2.py diff --git a/.gitignore b/.gitignore index fc45346..8c9e5f2 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,10 @@ src/snakemake/ !src/snakemake/accuracy/Snakefile !src/snakemake/accuracy/plot_match.py !src/snakemake/accuracy/plot_match_representative.py +!src/snakemake/count/Snakefile +!src/snakemake/count/plot_counts.py 
+!src/snakemake/count/plot_counts_representative.py +!src/snakemake/count/plot_counts_representative2.py !src/snakemake/distance/Snakefile diff --git a/src/snakemake/count/Snakefile b/src/snakemake/count/Snakefile new file mode 100644 index 0000000..5cd00db --- /dev/null +++ b/src/snakemake/count/Snakefile @@ -0,0 +1,278 @@ +rule all: + input: + "../results/Count_all.png", + "../results/Count_all8.png", + "../results/Unique.out", + "../results/Count_representative.png", + "../results/Unique_representative.out", + "../results/Count_representative_min1.png", + "../results/Count_representative_min.png", + "../results/Count_representative_hybrid1.png", + "../results/Count_representative_hybrid.png", + "../results/Count_representative_rand1.png", + "../results/Count_representative_rand.png", + "../results/Count_representative_min31.png", + "../results/Count_representative_min3.png", + "../results/Count_representative_hybrid31.png", + "../results/Count_representative_hybrid3.png", + "../results/Count_representative_rand31.png", + "../results/Count_representative_rand3.png", + "../results/Unique_representative2.out", + "../results/Unique_representative.out" + + +rule plot: + input: + "../results/Unique.out", + #[shape + "_minimiser_hash_16_16_counts.out" for shape in ["0", "36607", "51755"]], + #[shape + "_minimiser_hash_18_18_counts.out" for shape in ["0","233469", "246365"]], + #[shape + "_minimiser_hash_20_20_counts.out" for shape in ["0", "933855", "975475"]], + #[shape + "_minimiser_hash_22_22_counts.out" for shape in ["0", "4192891", "3669089"]], + #[shape + "_minimiser_hash_24_24_counts.out" for shape in ["0", "14548847", "13954519"]], + #[shape + "_minimiser_hash_26_26_counts.out" for shape in ["0", "62257151", "66560815"]], + #[shape + "_minimiser_hash_28_28_counts.out" for shape in ["0", "234879855", "241004285"]], + #[shape + "_minimiser_hash_30_30_counts.out" for shape in ["0", "805287931", "1004529051"]], + #[shape + "_minimiser_hash_32_32_counts.out" for 
shape in ["0", "3169577727", "3856068575"]], + # Shapes k + 4/8 + ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_counts.out" for k in range(16,34,2)], + [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+"_"+str([k for k in range(16,34,2)][i]+4)+"_counts.out" for i in range(9)], + [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in range(16,34,2)][i]+8)+"_counts.out" for i in range(9)], + # 4 "gaps" + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in [6,8,10]], + # 8 "gaps" + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]] + output: + "../results/Count_all.png", + "../results/Count_all8.png", + shell: "python3 plot_counts.py" + +rule plot_representative: + output: + "../results/Count_representative.png", + input: + # minimiser + 
["Rep_minimiser_hash_20_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]], + ["Rep_minimiser_hash_"+str(k)+"_40_counts.out" for k in [i for i in range(16,36,4)]], + # modmer + ["Rep_modmer_hash_20_"+str(w)+"_counts.out" for w in [3,5,7,9,11]], + ["Rep_modmer_hash_"+str(k)+"_7_counts.out" for k in [i for i in range(16,36,4)]], + # syncmer + ["syncmer_hash_20_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]], + ["syncmer_hash_"+str(k)+"_10_0_0_counts.out" for k in [i for i in range(22,12,-2)]], + ["syncmer_hash_20_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]], + ["syncmer_hash_"+str(k)+"_3_0_6_counts.out" for k in [i for i in range(28,8,-4)]], + shell: "python3 plot_counts_representative.py" + +rule plot_representative2: + output: + "../results/Count_representative_min1.png", + "../results/Count_representative_min.png", + "../results/Count_representative_hybrid1.png", + "../results/Count_representative_hybrid.png", + "../results/Count_representative_rand1.png", + "../results/Count_representative_rand.png", + "../results/Count_representative_min31.png", + "../results/Count_representative_min3.png", + "../results/Count_representative_hybrid31.png", + "../results/Count_representative_hybrid3.png", + "../results/Count_representative_rand31.png", + "../results/Count_representative_rand3.png" + input: + # minimiser based on strobemers + ["Rep2_"+smethod+"_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + 
["Rep2_"+smethod+"_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + # modmer based on strobemers + ["Rep2_"+smethod+"_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + # syncmer based on strobemers + [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], + [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], + [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + # syncmer with two positions based on strobemers + [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], + [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], + [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], + 
[smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] + + shell: "python3 plot_counts_representative2.py" + +rule download_human_genome: + output: + "../results/GRCh38.p13.genome.fa.gz" + shell: + """wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/GRCh38.p13.genome.fa.gz" -O ../results/GRCh38.p13.genome.fa.gz""" + +rule count_kmer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{shape}_minimiser_hash_{kmer_size}_{kmer_size}_counts.out", + "{shape}_minimiser_hash_{kmer_size}_{kmer_size}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method minimiser -k {wildcards.kmer_size} -w {wildcards.kmer_size} --shape {wildcards.shape} -o {wildcards.shape}_ {input}" + +rule count_minstrobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "minstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_counts.out", + "minstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method strobemer --min -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" + +rule count_hybridstrobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "hybridstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_counts.out", + "hybridstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_GRCh38.p13.genome.fa_counts.out" + shell: + """echo "minions counts --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" + minions counts --method strobemer --hybrid -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}""" + +rule count_randstrobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + 
"randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_counts.out", + "randstrobemers_{kmer_size}_{order}_{wmin}_{wmax}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method strobemer --rand -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" + +rule unique_kmer: + input: + #[shape + "_minimiser_hash_16_16_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "36607", "51755"]], + #[shape + "_minimiser_hash_18_18_GRCh38.p13.genome.fa_counts.out" for shape in ["0","233469", "246365"]], + #[shape + "_minimiser_hash_20_20_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "933855", "975475"]], + #[shape + "_minimiser_hash_22_22_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "4192891", "3669089"]], + #[shape + "_minimiser_hash_24_24_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "14548847", "13954519"]], + #[shape + "_minimiser_hash_26_26_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "62257151", "66560815"]], + #[shape + "_minimiser_hash_28_28_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "234879855", "241004285"]], + #[shape + "_minimiser_hash_30_30_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "805287931", "1004529051"]], + #[shape + "_minimiser_hash_32_32_GRCh38.p13.genome.fa_counts.out" for shape in ["0", "3169577727", "3856068575"]], + # k + 4/8 + ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(16,34,2)], + [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+"_"+str([k for k in range(16,34,2)][i]+4)+"_GRCh38.p13.genome.fa_counts.out" for i in range(9)], + [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in 
range(16,34,2)][i]+8)+"_GRCh38.p13.genome.fa_counts.out" for i in range(9)], + # 4 "gaps" + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + # 8 "gaps" + ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]] + output: + "../results/Unique.out" + shell: + "minions unique -o ../results/Unique.out {input}" + +rule count_minimiser_modmer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "Rep_{method}_hash_{kmer_size}_{w_size}_counts.out", + "Rep_{method}_hash_{kmer_size}_{w_size}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} -o Rep_ {input}" + +rule count_syncmer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + 
"syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_counts.out", + "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input}" + +rule count_minimiser_modmer_strobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "Rep2_{smethod}_{order}_{wmin}_{wmax}_Strobemer_{method}_hash_{kmer_size}_{w_size}_counts.out", + "Rep2_{smethod}_{order}_{wmin}_{wmax}_Strobemer_{method}_hash_{kmer_size}_{w_size}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} -o Rep2_{wildcards.smethod}_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_ {input} --strobemer --{wildcards.smethod} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order}" + +rule count_syncmer_strobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{smethod}_{order}_{wmin}_{wmax}_Strobemer_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_counts.out", + "{smethod}_{order}_{wmin}_{wmax}_Strobemer_syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_GRCh38.p13.genome.fa_counts.out" + shell: + "minions counts --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input} -o {wildcards.smethod}_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_ --strobemer --{wildcards.smethod} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order}" + +rule unique_representative: + input: + ["Rep_minimiser_hash_20_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)]], + ["Rep_minimiser_hash_"+str(k)+"_40_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(16,36,4)]], + # modmer + ["Rep_modmer_hash_20_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11]], + 
["Rep_modmer_hash_"+str(k)+"_7_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(16,36,4)]], + # syncmer + ["syncmer_hash_20_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10]], + ["syncmer_hash_"+str(k)+"_10_0_0_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(22,12,-2)]], + ["syncmer_hash_20_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1]], + ["syncmer_hash_"+str(k)+"_3_0_6_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(28,8,-4)]] + output: + "../results/Unique_representative.out" + shell: + "minions unique -o ../results/Unique_representative.out {input}" + + +rule unique_representative2: + input: +# minimiser based on strobemers + ["Rep2_"+smethod+"_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + # modmer based on strobemers + ["Rep2_"+smethod+"_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], + 
["Rep2_"+smethod+"_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + # syncmer based on strobemers + [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], + [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], + [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + # syncmer with two positions based on strobemers + [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], + [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], + [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]], + 
[smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] + output: + "../results/Unique_representative2.out" + shell: + "minions unique -o ../results/Unique_representative2.out {input}" diff --git a/src/snakemake/count/plot_counts.py b/src/snakemake/count/plot_counts.py new file mode 100644 index 0000000..a0b07f1 --- /dev/null +++ b/src/snakemake/count/plot_counts.py @@ -0,0 +1,271 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +k_size = [16,18,20,22,24,26,28,30,32] +pos = [x+0.25 for x in range(len(k_size))] +pos_order3 = [1.25,4.25,7.25] +k_order3 = [6,8,10] +k_size_order3 = [i*3 for i in k_order3] +strobe_range = [k for k in range(8,17)] + +def read_file(results, files): + for file in files: + with open(file, 'r') as f: + for line in f: + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev)) + return results + +# Read all files +kmers = read_file([], ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_counts.out" for k in k_size]) +#shapes4 = ["36607","233469","933855","4192891","14548847","62257151","234879855","805287931","3169577727"] +shapes4=['777695', '2621175', '16252901', '50196477', '251620351', '905838335', '4286578095', '13958643693', '66035113981'] +gapped4_kmers = read_file([], [shapes4[i] + "_minimiser_hash_"+str(k_size[i]+4)+"_"+str(k_size[i]+4)+"_counts.out" for i in range(len(k_size))]) +#shapes8 = ["51755","246365","975475","3669089","13954519","66560815","241004285","1004529051","3856068575"] +shapes8 = ['14021527', '45607667', '180082591', '1068161519', '3522001919', '13957854679', '64423783901', '205814423455', '1094946651927'] +gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size[i]+8)+"_"+str(k_size[i]+8)+"_counts.out" for i in range(len(k_size))]) + +kmers_order3 = read_file([], 
["0_minimiser_hash_"+str(k)+"_"+str(k)+"_counts.out" for k in k_size_order3]) +#shapes4_order3 = ["233469","14548847","805287931"] +shapes4_order3 = ['2621175', '251620351', '13958643693'] +gapped4_order3 = read_file([], [shapes4_order3[i] + "_minimiser_hash_"+str(k_size_order3[i]+4)+"_"+str(k_size_order3[i]+4)+"_counts.out" for i in range(len(k_order3))]) +#shapes8_order3 = ["246365","13954519","1004529051"] +shapes8_order3 = ['45607667', '3522001919', '205814423455'] +gapped8_order3 = read_file([], [shapes8_order3[i] + "_minimiser_hash_"+str(k_size_order3[i]+8)+"_"+str(k_size_order3[i]+8)+"_counts.out" for i in range(len(k_order3))]) + +minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in strobe_range]) +minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in k_order3]) +hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_counts.out" for k in strobe_range]) +hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_counts.out" for k in k_order3]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in strobe_range]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in k_order3]) +minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) +minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) +hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) +hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) 
+randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) + +# Plot comparison between all +fig = plt.figure() +X = np.arange(len(k_size)) + +colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("# of Submers") +plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Count_all.png",bbox_inches='tight') + +# Plot comparison between all +fig = plt.figure() +X = np.arange(len(k_size)) + +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("% of Submers") +divs = [0.5*(4**k) for k in k_size] +divs4 = [0.5*(4**(k-4)) for k in k_size] +divs8 = [0.5*(4**(k-8)) for k in k_size] +plt.plot(pos, [kmers[i][0]/divs[i] for i in range(len(k_size))], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos, [gapped4_kmers[i][0]/divs4[i] for i in range(len(k_size))], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos, [gapped8_kmers[i][0]/divs8[i] for i in range(len(k_size))], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos, [minstrobemers2[i][0]/divs[i] for i in range(len(k_size))], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, [hybridstrobemers2[i][0]/divs[i] for i in range(len(k_size))], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, 
[randstrobemers2[i][0]/divs[i] for i in range(len(k_size))], color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Count_all_percentage.png",bbox_inches='tight') + +# Plot comparison between all with 8 +fig = plt.figure() +X = np.arange(len(k_size)) + +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("# of Submers") + +plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Count_all8.png",bbox_inches='tight') + +# Plot comparison between all +fig = plt.figure() +X = np.arange(len(k_order3)) + +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("# of Submers") +plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[3], label='4 minstrobemers3',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[4], label='4 hybridstrobemers3',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[5], label='4 randstrobemers3',linewidth=3.0) + +#plt.legend(bbox_to_anchor=(1.01, 0.75), 
title="Methods") +plt.savefig("../results/Count_all3.png",bbox_inches='tight') + +# Plot comparison between all +fig = plt.figure() +X = np.arange(len(k_order3)) + +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("# of Submers") +plt.plot(pos_order3, [x[0] for x in kmers_order3], color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped4_order3], color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in gapped8_order3], color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='4 minstrobemers3',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='4 hybridstrobemers3',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='4 randstrobemers3',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Count_all38.png",bbox_inches='tight') + +# Plot Uniqueness +kmers = [] +gapped4 = [] +gapped8 = [] +kmers_order3 = [] +gapped4_order3 = [] +gapped8_order3 = [] +minstrobemers2 = [] +minstrobemers3 = [] +hybridstrobemers2 = [] +hybridstrobemers3 = [] +randstrobemers2 = [] +randstrobemers3 = [] +minstrobemers28 = [] +minstrobemers38 = [] +hybridstrobemers28 = [] +hybridstrobemers38 = [] +randstrobemers28 = [] +randstrobemers38 = [] + +it = 0 +with open("../results/Unique.out", 'r') as f: + for line in f: + number = float(line.split()[1]) + if (it < 27): + mod = it % 3 + if (it < 9): + kmers.append(number) + elif (it < 18): + gapped4.append(number) + elif (it < 27): + gapped8.append(number) + elif (it < 36): + minstrobemers2.append(number) + elif (it < 39): + minstrobemers3.append(number) + elif (it < 48): + hybridstrobemers2.append(number) + elif (it < 51): + hybridstrobemers3.append(number) + elif (it < 60): + randstrobemers2.append(number) + elif (it < 63): + randstrobemers3.append(number) + elif 
(it < 72): + minstrobemers28.append(number) + elif (it < 75): + minstrobemers38.append(number) + elif (it < 84): + hybridstrobemers28.append(number) + elif (it < 87): + hybridstrobemers38.append(number) + elif (it < 96): + randstrobemers28.append(number) + elif (it < 99): + randstrobemers38.append(number) + it += 1 + +kmers_order3 = [kmers[1],kmers[4],kmers[7]] +gapped4_order3 = [gapped4[1],gapped4[4],gapped4[7]] +gapped8_order3 = [gapped8[1],gapped8[4],gapped8[7]] + +# Plot comparison between all +fig = plt.figure() +X = np.arange(len(k_size)) + +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("% of unique submers") +plt.plot(pos, kmers, color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos, gapped4, color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos, gapped8, color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos, minstrobemers2, color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, hybridstrobemers2, color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, randstrobemers2, color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Unique_all.png",bbox_inches='tight') + +# Plot comparison between all 8 +fig = plt.figure() +X = np.arange(len(k_size)) + +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("% of unique submers") +plt.plot(pos, kmers, color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos, gapped4, color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos, gapped8, color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos, minstrobemers28, color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos, hybridstrobemers28, color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos, randstrobemers28, color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") 
+plt.savefig("../results/Unique_all8.png",bbox_inches='tight') + +# Plot comparison between all order 3 +fig = plt.figure() +X = np.arange(len(k_order3)) + +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("% of unique submers") +plt.plot(pos_order3, kmers_order3, color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, gapped4_order3, color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, gapped8_order3, color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, minstrobemers3, color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos_order3, hybridstrobemers3, color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos_order3, randstrobemers3, color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Unique_all_order3.png",bbox_inches='tight') + +# Plot comparison between all order 3 8 +fig = plt.figure() +X = np.arange(len(k_order3)) + +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("% of unique submers") +plt.plot(pos_order3, kmers_order3, color = colors[0], label='k-mer', linewidth=3.0) +plt.plot(pos_order3, gapped4_order3, color = colors[1], label='4 k-mer',linewidth=3.0) +plt.plot(pos_order3, gapped8_order3, color = colors[2], label='8 k-mer',linewidth=3.0) +plt.plot(pos_order3, minstrobemers38, color = colors[3], label='minstrobemers',linewidth=3.0) +plt.plot(pos_order3, hybridstrobemers38, color = colors[4], label='hybridstrobemers',linewidth=3.0) +plt.plot(pos_order3, randstrobemers38, color = colors[5], label='randstrobemers',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Unique_all_order38.png",bbox_inches='tight') diff --git a/src/snakemake/count/plot_counts_representative.py b/src/snakemake/count/plot_counts_representative.py new file mode 100644 index 0000000..3f78796 --- /dev/null +++ 
b/src/snakemake/count/plot_counts_representative.py @@ -0,0 +1,102 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +def read_file(results, files): + for file in files: + with open(file, 'r') as f: + for line in f: + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev)) + return results + +minimiser = read_file([], ["Rep_minimiser_hash_20_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +minimiser_setw = read_file([], ["Rep_minimiser_hash_"+str(k)+"_40_counts.out" for k in [i for i in range(16,36,4)]]) +# modmer +modmer = read_file([], ["Rep_modmer_hash_20_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +modmer_setw = read_file([], ["Rep_modmer_hash_"+str(k)+"_7_counts.out" for k in [i for i in range(16,36,4)]]) +# syncmer +opensyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +opensyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_10_0_0_counts.out" for k in [i for i in range(22,12,-2)]]) +closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_6_counts.out" for k in [i for i in range(28,8,-4)]]) + + +# Plot comparison between k-mers +k_size = [i for i in range(5)] +pos = [x+0.25 for x in range(len(k_size))] + +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] +plt.xlabel("w,m or s") +plt.xticks(pos, k_size) +plt.ylabel("# of submers") # in microseconds + +plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) +plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], 
label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75),title="Methods") +plt.savefig("../results/Count_representative.png", bbox_inches='tight') + + +# Plot comparison between k-mers +k_size = [i for i in range(16,36,4)] +pos = [x+0.25 for x in range(len(k_size))] + +fig = plt.figure() +X = np.arange(len(k_size)) +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("# of submers") # in microseconds + +plt.plot(pos, [x[0] for x in minimiser_setw], color = colors[0], label='(40,k)-minimizer',linewidth=3.0) +plt.plot(pos, [x[0] for x in modmer_setw], color = colors[1], label='(k,7)-modmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in opensyncmer_setw], color = colors[2], label='(k,10,[0],1)-syncmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer_setw], color = colors[3], label='(k,3,[0,6],1)-syncmer',linewidth=3.0) + +plt.legend(bbox_to_anchor=(1.01, 0.75),title="Methods") +plt.savefig("../results/Count_representative_setw.png", bbox_inches='tight') + +# Plot Uniqueness +minimiser = [] +modmer = [] +opensyncmer = [] +closedsyncmer = [] + +it = 0 +with open("../results/Unique_representative.out", 'r') as f: + for line in f: + number = float(line.split()[1]) + if (it < 10): + minimiser.append(number) + elif (it < 20): + modmer.append(number) + elif (it < 30): + opensyncmer.append(number) + elif (it < 40): + closedsyncmer.append(number) + it += 1 +print(modmer) +# Plot comparison between k-mers +k_size = [i for i in range(5)] +pos = [x+0.25 for x in range(len(k_size))] + +fig = plt.figure() +X = np.arange(len(k_size)) +plt.xlabel("w,m or s") +plt.xticks(pos, k_size) +plt.ylabel("% of unique submers") # in microseconds + +plt.plot(pos, minimiser[:5], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) +plt.plot(pos, modmer[:5], color = colors[1], label='(20,m)-modmer', linewidth=3.0) +plt.plot(pos, opensyncmer[:5], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) +plt.plot(pos, closedsyncmer[:5], color = 
colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + +plt.legend(title="Methods") +plt.savefig("../results/Unique_representative.png", bbox_inches='tight') diff --git a/src/snakemake/count/plot_counts_representative2.py b/src/snakemake/count/plot_counts_representative2.py new file mode 100644 index 0000000..76ad554 --- /dev/null +++ b/src/snakemake/count/plot_counts_representative2.py @@ -0,0 +1,400 @@ +import sys + +import numpy as np +import matplotlib.pyplot as plt +import numpy as np + +def read_file(results, files): + for file in files: + with open(file, 'r') as f: + for line in f: + mean = round(float(line.split('\t')[2]),2) + stdev = round(float(line.split('\t')[3]),2) + results.append((mean,stdev)) + return results + + +def create_plot(minimiser, modmer, opensyncmer, closedsyncmer, outfile,number_elem=5): + # Plot comparison between k-mers + k_size = [i for i in range(number_elem)] + pos = [x+0.25 for x in range(len(k_size))] + + fig = plt.figure() + X = np.arange(len(k_size)) + colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] + colors_error = ["#01d63a","#00e7e0","#fefea1","#748beb"] + plt.xlabel("w,m or s") + plt.xticks(pos, k_size) + plt.ylabel("# of submers") + + if (number_elem == 5): + plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) + plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer') + plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + else: + plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,27)-minimizer',linewidth=3.0) + plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(27,m)-modmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(27,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = 
colors[3], label='(27,s,[0,6],1)-syncmer',linewidth=3.0) + + plt.legend(title="Methods") + plt.savefig(outfile, bbox_inches='tight') + +minimiser = read_file([], ["Rep2_min_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_min_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["min_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["min_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min1.png") + +minimiser = read_file([], ["Rep2_rand_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_rand_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["rand_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["rand_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand1.png") + +minimiser = read_file([], ["Rep2_hybrid_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["hybrid_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["hybrid_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid1.png") + +minimiser = read_file([], 
["Rep2_min_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_min_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["min_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["min_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min.png") + + +minimiser = read_file([], ["Rep2_rand_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_rand_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["rand_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["rand_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand.png") + +minimiser = read_file([], ["Rep2_hybrid_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["hybrid_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["hybrid_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid.png") + +minimiser = read_file([], ["Rep2_min_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], 
["Rep2_min_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["min_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["min_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min31.png", 4) + +minimiser = read_file([], ["Rep2_rand_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_rand_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["rand_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["rand_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand31.png", 4) + +minimiser = read_file([], ["Rep2_hybrid_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["hybrid_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["hybrid_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid31.png",4) + +minimiser = read_file([], ["Rep2_min_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_min_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["min_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in 
[26,24,22,20]]) +closedsyncmer = read_file([], ["min_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min3.png", 4) + +minimiser = read_file([], ["Rep2_rand_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_rand_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["rand_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["rand_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand3.png", 4) + +minimiser = read_file([], ["Rep2_hybrid_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["hybrid_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["hybrid_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid3.png",4) + + +# Plot Uniqueness +minimiser_hybrid1 = [] +modmer_hybrid1 = [] +opensyncmer_hybrid1 = [] +closedsyncmer_hybrid1 = [] + +minimiser_hybrid = [] +modmer_hybrid = [] +opensyncmer_hybrid = [] +closedsyncmer_hybrid = [] + +minimiser_hybrid31 = [] +modmer_hybrid31 = [] +opensyncmer_hybrid31 = [] +closedsyncmer_hybrid31 = [] + +minimiser_hybrid3 = [] +modmer_hybrid3 = [] +opensyncmer_hybrid3 = [] +closedsyncmer_hybrid3 = [] + +minimiser_rand1 = [] +modmer_rand1 = [] +opensyncmer_rand1 = [] +closedsyncmer_rand1 = [] + +minimiser_rand = [] 
+modmer_rand = [] +opensyncmer_rand = [] +closedsyncmer_rand = [] + +minimiser_rand31 = [] +modmer_rand31 = [] +opensyncmer_rand31 = [] +closedsyncmer_rand31 = [] + +minimiser_rand3 = [] +modmer_rand3 = [] +opensyncmer_rand3 = [] +closedsyncmer_rand3 = [] + +minimiser_min1 = [] +modmer_min1 = [] +opensyncmer_min1 = [] +closedsyncmer_min1 = [] + +minimiser_min = [] +modmer_min = [] +opensyncmer_min = [] +closedsyncmer_min = [] + +minimiser_min31 = [] +modmer_min31 = [] +opensyncmer_min31 = [] +closedsyncmer_min31 = [] + +minimiser_min3 = [] +modmer_min3 = [] +opensyncmer_min3 = [] +closedsyncmer_min3 = [] + +it = 0 +with open("../results/Unique_representative2.out", 'r') as f: + for line in f: + number = float(line.split()[1]) + if ("2_0_17_" in line): + if ("hybrid" in line): + if ("minimiser" in line): + minimiser_hybrid.append(number) + elif ("modmer" in line): + modmer_hybrid.append(number) + elif ("_0_0_" in line): + opensyncmer_hybrid.append(number) + else: + closedsyncmer_hybrid.append(number) + elif ("rand" in line): + if ("minimiser" in line): + minimiser_rand.append(number) + elif ("modmer" in line): + modmer_rand.append(number) + elif ("_0_0_" in line): + opensyncmer_rand.append(number) + else: + closedsyncmer_rand.append(number) + elif ("min_" in line): + if ("minimiser" in line): + minimiser_min.append(number) + elif ("modmer" in line): + modmer_min.append(number) + elif ("_0_0_" in line): + opensyncmer_min.append(number) + else: + closedsyncmer_min.append(number) + elif ("3_0_17_" in line): + if ("hybrid" in line): + if ("minimiser" in line): + minimiser_hybrid3.append(number) + elif ("modmer" in line): + modmer_hybrid3.append(number) + elif ("_0_0_" in line): + opensyncmer_hybrid3.append(number) + else: + closedsyncmer_hybrid3.append(number) + elif ("rand" in line): + if ("minimiser" in line): + minimiser_rand3.append(number) + elif ("modmer" in line): + modmer_rand3.append(number) + elif ("_0_0_" in line): + opensyncmer_rand3.append(number) + else: + 
closedsyncmer_rand3.append(number) + elif ("min_" in line): + if ("minimiser" in line): + minimiser_min3.append(number) + elif ("modmer" in line): + modmer_min3.append(number) + elif ("_0_0_" in line): + opensyncmer_min3.append(number) + else: + closedsyncmer_min3.append(number) + elif ("3_0_13_" in line): + if ("hybrid" in line): + if ("minimiser" in line): + minimiser_hybrid31.append(number) + elif ("modmer" in line): + modmer_hybrid31.append(number) + elif ("_0_0_" in line): + opensyncmer_hybrid31.append(number) + else: + closedsyncmer_hybrid31.append(number) + elif ("rand" in line): + if ("minimiser" in line): + minimiser_rand31.append(number) + elif ("modmer" in line): + modmer_rand31.append(number) + elif ("_0_0_" in line): + opensyncmer_rand31.append(number) + else: + closedsyncmer_rand31.append(number) + elif ("min_" in line): + if ("minimiser" in line): + minimiser_min31.append(number) + elif ("modmer" in line): + modmer_min31.append(number) + elif ("_0_0_" in line): + opensyncmer_min31.append(number) + else: + closedsyncmer_min31.append(number) + else: + if ("hybrid" in line): + if ("minimiser" in line): + minimiser_hybrid1.append(number) + elif ("modmer" in line): + modmer_hybrid1.append(number) + elif ("_0_0_" in line): + opensyncmer_hybrid1.append(number) + else: + closedsyncmer_hybrid1.append(number) + elif ("rand" in line): + if ("minimiser" in line): + minimiser_rand1.append(number) + elif ("modmer" in line): + modmer_rand1.append(number) + elif ("_0_0_" in line): + opensyncmer_rand1.append(number) + else: + closedsyncmer_rand1.append(number) + elif ("min_" in line): + if ("minimiser" in line): + minimiser_min1.append(number) + elif ("modmer" in line): + modmer_min1.append(number) + elif ("_0_0_" in line): + opensyncmer_min1.append(number) + else: + closedsyncmer_min1.append(number) + it += 1 +print(modmer) +# Plot comparison between k-mers + +def plot_unique(minimiser, modmer, opensyncmer, closedsyncmer, outfile, num_elem = 5): + k_size = [i for i 
in range(num_elem)] + pos = [x+0.25 for x in range(len(k_size))] + + fig = plt.figure() + X = np.arange(len(k_size)) + colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] + colors_error = ["#01d63a","#00e7e0","#fefea1","#748beb"] + plt.xlabel("w,m or s") + plt.xticks(pos, k_size) + plt.ylabel("% of unique submers") # percentage of unique submers, not a timing + + if (num_elem == 5): + plt.plot(pos, minimiser[:num_elem], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) + plt.plot(pos, modmer[:num_elem], color = colors[1], label='(20,m)-modmer',linewidth=3.0) + plt.plot(pos, opensyncmer[:num_elem], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, closedsyncmer[:num_elem], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + else: + plt.plot(pos, minimiser[:num_elem], color = colors[0], label='(w,27)-minimizer',linewidth=3.0) + plt.plot(pos, modmer[:num_elem], color = colors[1], label='(27,m)-modmer',linewidth=3.0) + plt.plot(pos, opensyncmer[:num_elem], color = colors[2], label='(27,s,[0],1)-syncmer',linewidth=3.0) + plt.plot(pos, closedsyncmer[:num_elem], color = colors[3], label='(27,s,[0,6],1)-syncmer',linewidth=3.0) + + plt.legend(title="Methods") + plt.savefig(outfile, bbox_inches='tight') + +plot_unique(minimiser_min, modmer_min,opensyncmer_min, closedsyncmer_min, "../results/Unique_Representative_min.png") +plot_unique(minimiser_min1, modmer_min1,opensyncmer_min1, closedsyncmer_min1, "../results/Unique_Representative_min1.png") +plot_unique(minimiser_rand, modmer_rand,opensyncmer_rand, closedsyncmer_rand, "../results/Unique_Representative_rand.png") +plot_unique(minimiser_rand1, modmer_rand1,opensyncmer_rand1, closedsyncmer_rand1, "../results/Unique_Representative_rand1.png") +plot_unique(minimiser_hybrid, modmer_hybrid,opensyncmer_hybrid, closedsyncmer_hybrid, "../results/Unique_Representative_hybrid.png") +plot_unique(minimiser_hybrid1, modmer_hybrid1,opensyncmer_hybrid1, closedsyncmer_hybrid1, "../results/Unique_Representative_hybrid1.png") + +plot_unique(minimiser_min3, 
modmer_min3,opensyncmer_min3, closedsyncmer_min3, "../results/Unique_Representative_min3.png",4) +plot_unique(minimiser_min31, modmer_min31,opensyncmer_min31, closedsyncmer_min31, "../results/Unique_Representative_min31.png",4) +plot_unique(minimiser_rand3, modmer_rand3,opensyncmer_rand3, closedsyncmer_rand3, "../results/Unique_Representative_rand3.png",4) +plot_unique(minimiser_rand31, modmer_rand31,opensyncmer_rand31, closedsyncmer_rand31, "../results/Unique_Representative_rand31.png",4) +plot_unique(minimiser_hybrid3, modmer_hybrid3,opensyncmer_hybrid3, closedsyncmer_hybrid3, "../results/Unique_Representative_hybrid3.png",4) +plot_unique(minimiser_hybrid31, modmer_hybrid31,opensyncmer_hybrid31, closedsyncmer_hybrid31, "../results/Unique_Representative_hybrid31.png",4) + + +minimiser = [] +modmer = [] +opensyncmer = [] +closedsyncmer = [] +it = 0 +with open("../results/Unique_representative.out", 'r') as f: + for line in f: + number = float(line.split()[1]) + if (it < 10): + minimiser.append(number) + elif (it < 20): + modmer.append(number) + elif (it < 30): + opensyncmer.append(number) + elif (it < 40): + closedsyncmer.append(number) + it += 1 + +minimiser_all = [np.mean(minimiser),np.mean(minimiser_min1),np.mean(minimiser_hybrid1),np.mean(minimiser_rand1)] +modmer_all = [np.mean(modmer),np.mean(modmer_min1),np.mean(modmer_hybrid1),np.mean(modmer_rand1)] +opensyncmer_all = [np.mean(opensyncmer),np.mean(opensyncmer_min1),np.mean(opensyncmer_hybrid1),np.mean(opensyncmer_rand1)] +closedsyncmer_all = [np.mean(closedsyncmer),np.mean(closedsyncmer_min1),np.mean(closedsyncmer_hybrid1),np.mean(closedsyncmer_rand1)] + +def plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, outfile): # grouped bars: one group per x tick, one bar per method + X = np.arange(4) + colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] + fig = plt.figure() + ax = fig.add_axes([0,0,1,1]) + ax.bar(X + 0.00, minimiser_all, color = colors[0], label='minimizer', width = 0.2) + ax.bar(X + 0.2, modmer_all, color = colors[1], label='modmer', width 
= 0.2) + ax.bar(X + 0.4, opensyncmer_all, color = colors[2], label='open syncmer', width = 0.2) + ax.bar(X + 0.6, closedsyncmer_all, color = colors[3], label='closed syncmer', width = 0.2) + ax.set_xticks([0.3,1.3,2.3,3.3]) + ax.set_xticklabels(["k-mer","min", "hybrid", "rand"]) + + plt.legend(title="Methods", bbox_to_anchor=(1.01, 0.65)) + plt.savefig(outfile,bbox_inches='tight') + +minimiser_all = [np.mean(minimiser),np.mean(minimiser_min1),np.mean(minimiser_hybrid1),np.mean(minimiser_rand1)] +modmer_all = [np.mean(modmer),np.mean(modmer_min1),np.mean(modmer_hybrid1),np.mean(modmer_rand1)] +opensyncmer_all = [np.mean(opensyncmer),np.mean(opensyncmer_min1),np.mean(opensyncmer_hybrid1),np.mean(opensyncmer_rand1)] +closedsyncmer_all = [np.mean(closedsyncmer),np.mean(closedsyncmer_min1),np.mean(closedsyncmer_hybrid1),np.mean(closedsyncmer_rand1)] +plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, "../results/Unique_representative_all_bar1.png") + +minimiser_all = [np.mean(minimiser),np.mean(minimiser_min),np.mean(minimiser_hybrid),np.mean(minimiser_rand)] +modmer_all = [np.mean(modmer),np.mean(modmer_min),np.mean(modmer_hybrid),np.mean(modmer_rand)] +opensyncmer_all = [np.mean(opensyncmer),np.mean(opensyncmer_min),np.mean(opensyncmer_hybrid),np.mean(opensyncmer_rand)] +closedsyncmer_all = [np.mean(closedsyncmer),np.mean(closedsyncmer_min),np.mean(closedsyncmer_hybrid),np.mean(closedsyncmer_rand)] +plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, "../results/Unique_representative_all_bar.png") + +minimiser_all = [np.mean(minimiser),np.mean(minimiser_min3),np.mean(minimiser_hybrid3),np.mean(minimiser_rand3)] +modmer_all = [np.mean(modmer),np.mean(modmer_min3),np.mean(modmer_hybrid3),np.mean(modmer_rand3)] +opensyncmer_all = [np.mean(opensyncmer),np.mean(opensyncmer_min3),np.mean(opensyncmer_hybrid3),np.mean(opensyncmer_rand3)] +closedsyncmer_all = 
[np.mean(closedsyncmer),np.mean(closedsyncmer_min3),np.mean(closedsyncmer_hybrid3),np.mean(closedsyncmer_rand3)] +plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, "../results/Unique_representative_all_bar3.png") + +minimiser_all = [np.mean(minimiser),np.mean(minimiser_min31),np.mean(minimiser_hybrid31),np.mean(minimiser_rand31)] +modmer_all = [np.mean(modmer),np.mean(modmer_min31),np.mean(modmer_hybrid31),np.mean(modmer_rand31)] +opensyncmer_all = [np.mean(opensyncmer),np.mean(opensyncmer_min31),np.mean(opensyncmer_hybrid31),np.mean(opensyncmer_rand31)] +closedsyncmer_all = [np.mean(closedsyncmer),np.mean(closedsyncmer_min31),np.mean(closedsyncmer_hybrid31),np.mean(closedsyncmer_rand31)] +plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, "../results/Unique_representative_all_bar31.png") From 71c1b52337509a3e720fea167f712dedaf73c1fc Mon Sep 17 00:00:00 2001 From: MitraDarja Date: Tue, 30 May 2023 16:45:58 +0200 Subject: [PATCH 23/34] Add counts --- README.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b763bca..1378a25 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,21 @@ Run test to check, if Comparison is working as intended. All tests should pass. make test ``` +# Counts + +Counts creates two output files: one named `{method}_{inputfile_name}_counts.out`, a binary file storing all submers and their respective count values, and one named `{method}_counts.out` storing the minimum, the mean, the variance and the maximum of the count values. Counts can also handle multiple files and calculate the mean over all sequences found in all files. For all supported methods, Counts considers the canonical version. + +Example usage for calculating the counts of k-mers of a given input file `in.fasta`: +``` +minions counts --method kmer -k 16 in.fasta +``` +This results in the two files: `kmer_hash_16_in_counts.out` and `kmer_hash_16_counts.out`. 
+ # Speed Speeds creates a file called `{method}_speed.out` and returns the speed of processing a singular sequence in microseconds. As typical one sequence file contains multiple sequences the minimum speed, the mean, the variance and the maximum speed are returned. Speed can also handle multiple files and calculate the mean over all sequences found in all files. Speed considers for all supported methods the non-canonical version. -Example usage for calculating the k-mers of a given input file `in.fa`: +Example usage for calculating the speed of k-mers of a given input file `in.fasta`: ``` minions speed --method kmer -k 16 in.fasta ``` @@ -57,6 +67,21 @@ For the original implementation, add the flag `--original` and note that for the `w-max` in the implementation from minions is the window length that should be considered for every strobe besides the first one. All strobes need to be completely inside this window length to be considered. While for the original implementation, it is the position in the sequence until which a strobe that is considered has to start. Therefore, for a strobemer with a strobe length of 8, `w-min` of 0 and `w-max` of 15 in the minion implementation would equal a `w-min` of 9 and `w-max` of 17. For more details, please read the documentation for both implementations. +# Unique + +Unique should be run after counts, as the input should be a `{method}_{inputfile_name}_counts.out` file, which stores the submers with their count values. Unique then calculates the percentage of unique submers for all given files and reports it in an output file. + +Example usage for calculating the uniqueness of k-mers for the output file of the example in counts: +``` +minions unique kmer_hash_16_in_counts.out -o Unique.out +``` + +This results in the file `Unique.out`, which looks like: +``` +kmer_hash_16 89.7 +``` +If multiple files are given, each file adds another row. 
+ # Methods If a metric supports a method, pick it with the flag `--method`. From 73d4075167b06207a96cc8584d37007b56da3cfe Mon Sep 17 00:00:00 2001 From: mitradarja Date: Fri, 2 Jun 2023 13:00:57 +0200 Subject: [PATCH 24/34] Update accuracy. --- .gitignore | 1 + src/snakemake/accuracy/Snakefile | 38 ++--- src/snakemake/accuracy/create_res.py | 12 ++ src/snakemake/accuracy/plot_accuracy.py | 17 ++- src/snakemake/accuracy/plot_match.py | 134 +++++++++++++++--- .../accuracy/plot_match_representative.py | 10 +- .../count/plot_counts_representative2.py | 1 + 7 files changed, 168 insertions(+), 45 deletions(-) create mode 100644 src/snakemake/accuracy/create_res.py diff --git a/.gitignore b/.gitignore index 8c9e5f2..2322d5b 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ src/snakemake/ !src/snakemake/accuracy/Snakefile !src/snakemake/accuracy/plot_match.py !src/snakemake/accuracy/plot_match_representative.py +!src/snakemake/accuracy/create_res.py !src/snakemake/count/Snakefile !src/snakemake/count/plot_counts.py !src/snakemake/count/plot_counts_representative.py diff --git a/src/snakemake/accuracy/Snakefile b/src/snakemake/accuracy/Snakefile index e5db1c0..ae398aa 100644 --- a/src/snakemake/accuracy/Snakefile +++ b/src/snakemake/accuracy/Snakefile @@ -19,13 +19,13 @@ rule all: ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], - 
["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,9,12] for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], # Representative ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4) for error in [1,2,5,10]], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], @@ -57,20 +57,20 @@ rule all: ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], # Accuracy - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,3,4,5] 
for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]], - [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for error in 
[2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + 
[str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], # Representative - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], -# ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,3,4,5] for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7]] + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,3,4,5] for threshold in 
[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]] rule download_example_Data: output: diff --git a/src/snakemake/accuracy/create_res.py b/src/snakemake/accuracy/create_res.py new file mode 100644 index 0000000..8c0005d --- /dev/null +++ b/src/snakemake/accuracy/create_res.py @@ -0,0 +1,12 @@ +import math + +exp = 0 +with open("search_results.out",'w') as o: + for i in range(1048576): + o.write(str(i)) + o.write('\t') + o.write(str(math.floor(i/16384))) + o.write('\n') + if ((i % 16384) == 0): + print(i) + exp=exp+1 diff --git a/src/snakemake/accuracy/plot_accuracy.py b/src/snakemake/accuracy/plot_accuracy.py index 4975502..9d1e3ae 100644 --- a/src/snakemake/accuracy/plot_accuracy.py +++ b/src/snakemake/accuracy/plot_accuracy.py @@ -4,7 +4,7 @@ import matplotlib.pyplot as plt import numpy as np -thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7] +thresholds = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7] pos = [x+0.25 for x in range(len(thresholds))] strobe_range = [10] @@ -26,11 +26,11 @@ def read_file(results, files): results.append([num_fp, fn_0]) # Read all files for an error -for error in [2,5]: +for error in [2,3,4,5]: results = [] read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) - read_file(results, ["933855_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) - read_file(results, ["975475_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for threshold in thresholds]) + read_file(results, 
["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for threshold in thresholds]) read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) @@ -41,6 +41,15 @@ def read_file(results, files): print("Error: ",error, "\n", results) +print("Representative:") +for error in [2,3,4,5]: + results = [] + read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_24_all_accuracy.out" for k in [20] for threshold in thresholds]) + #read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_3_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_syncmer_hash_20_18_0_0"+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_syncmer_hash_20_15_0_6"+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + print("Error: ",error, "\n", results) + def fix(): fig = plt.figure() labels = ['k-mer','4 k-mer','8 k-mer', 'minstrobemers','hybridstrobemers','randstrobemers'] diff --git a/src/snakemake/accuracy/plot_match.py b/src/snakemake/accuracy/plot_match.py index a262d71..bc2b881 100644 --- a/src/snakemake/accuracy/plot_match.py +++ b/src/snakemake/accuracy/plot_match.py @@ -10,8 +10,8 @@ k_size = [16,20,24,28,32] pos = [x+0.25 for x in range(len(k_size))] pos_order3 = [1.25,4.25,7.25] -k_order3 = [9,12,15] -k_size_order3 = [i*2 for i in k_order3] +k_order3 = [6,8,10] +k_size_order3 = [i*3 for i in k_order3] strobe_range = [k for k in range(8,17,2)] def read_file(results, files): @@ -22,18 +22,18 @@ def read_file(results, files): if (line[:7]=="Match C"): cov = 
round(float(line.split()[2]),2) if (line[:7]=="Islands"): - mean = round(float(line.split('\t')[2]),2) - stdev = round(float(line.split('\t')[3]),2) + mean = round(float(line.split('\t')[1]),2) + stdev = round(float(line.split('\t')[2]),2) results.append((mean,stdev,cov)) return results # Read all files for an error for error in [1,2,5,10]: kmers = read_file([], ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_match_"+str(error)+".out" for k in range(16,36,4)]) - shapes4 = ["36607","933855","14548847","234879855","3169577727"] - gapped4_kmers = read_file([], [shapes4[i] + "_minimiser_hash_"+str(k_size[i])+"_"+str(k_size[i])+"_match_"+str(error)+".out" for i in range(len(k_size))]) - shapes8 = ["51755","975475","13954519","241004285","3856068575"] - gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size[i])+"_"+str(k_size[i])+"_match_"+str(error)+".out" for i in range(len(k_size))]) + shapes4=['777695', '16252901', '251620351', '4286578095', '66035113981'] + gapped4_kmers = read_file([], [shapes4[i] + "_minimiser_hash_"+str(k_size[i]+4)+"_"+str(k_size[i]+4)+"_match_"+str(error)+".out" for i in range(len(k_size))]) + shapes8 = ['14021527', '180082591', '3522001919', '64423783901', '1094946651927'] + gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size[i]+8)+"_"+str(k_size[i]+8)+"_match_"+str(error)+".out" for i in range(len(k_size))]) minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in strobe_range]) hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) @@ -46,27 +46,39 @@ def read_file(results, files): # Plot comparison between all Island size fig = plt.figure() X = np.arange(len(k_size)) - - colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] plt.xlabel("k") plt.xticks(pos, k_size) 
plt.ylabel("Average island size") - plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) - - plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.legend(loc = "upper left", title="Methods") plt.savefig("../results/Match_island_"+str(error)+".png",bbox_inches='tight') + # Plot comparison between all Island size 8 + fig = plt.figure() + X = np.arange(len(k_size)) + plt.xlabel("k") + plt.xticks(pos, k_size) + plt.ylabel("Average island size") + plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.legend(loc = "upper left", title="Methods") + plt.savefig("../results/Match_island_8_"+str(error)+".png",bbox_inches='tight') + # Plot comparison between all match coverage fig = plt.figure() X = np.arange(len(k_size)) - colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] + colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] plt.xlabel("k") plt.xticks(pos, 
k_size) plt.ylabel("Match coverage") @@ -77,7 +89,97 @@ def read_file(results, files): plt.plot(pos, [x[2] for x in minstrobemers2], color = colors[3], label='minstrobemers',linewidth=3.0) plt.plot(pos, [x[2] for x in hybridstrobemers2], color = colors[4], label='hybridstrobemers',linewidth=3.0) plt.plot(pos, [x[2] for x in randstrobemers2], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.legend(loc = "upper right", title="Methods") + plt.savefig("../results/Match_cov_"+str(error)+".png",bbox_inches='tight') + fig = plt.figure() + X = np.arange(len(k_size)) + plt.xlabel("k") + plt.xticks(pos, k_size) + plt.ylabel("Match coverage") - plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") - plt.savefig("../results/Match_cov_"+str(error)+".png",bbox_inches='tight') + plt.plot(pos, [x[2] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos, [x[2] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos, [x[2] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos, [x[2] for x in minstrobemers28], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos, [x[2] for x in hybridstrobemers28], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos, [x[2] for x in randstrobemers28], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.legend(loc = "upper right", title="Methods") + plt.savefig("../results/Match_cov_8_"+str(error)+".png",bbox_inches='tight') + + # Plot comparison between all match coverage order 3 + kmers = read_file([], ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_match_"+str(error)+".out" for k in [18,24,30]]) + shapes4=["2621175", "251620351", "13958643693"] + gapped4_kmers = read_file([], [shapes4[i] + "_minimiser_hash_"+str(k_size_order3[i]+4)+"_"+str(k_size_order3[i]+4)+"_match_"+str(error)+".out" for i in range(len(k_size_order3))]) + shapes8 = ["45607667", "3522001919", "205814423455"] + gapped8_kmers = 
read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size_order3[i]+8)+"_"+str(k_size_order3[i]+8)+"_match_"+str(error)+".out" for i in range(len(k_size_order3))]) + + minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in k_order3]) + hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in k_order3]) + randstrobemers3 = read_file([], ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in k_order3]) + minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) + hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) + randstrobemers38 = read_file([], ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) + + fig = plt.figure() + X = np.arange(len(k_size_order3)) + plt.xlabel("k") + plt.xticks(pos_order3, k_order3) + plt.ylabel("Match coverage") + + plt.plot(pos_order3, [x[2] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in minstrobemers3], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in hybridstrobemers3], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) + + plt.legend(loc = "upper right", title="Methods") + plt.savefig("../results/Match_cov_3_"+str(error)+".png",bbox_inches='tight') + + fig = plt.figure() + X = np.arange(len(k_size_order3)) + plt.xlabel("k") + plt.xticks(pos_order3, 
k_order3) + plt.ylabel("Match coverage") + + plt.plot(pos_order3, [x[2] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in minstrobemers38], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in hybridstrobemers38], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[2] for x in randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) + + plt.legend(loc = "upper right", title="Methods") + plt.savefig("../results/Match_cov_38_"+str(error)+".png",bbox_inches='tight') + + # Plot comparison between all Island size order 3 + fig = plt.figure() + X = np.arange(len(k_size_order3)) + colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] + plt.xlabel("k") + plt.xticks(pos_order3, k_size_order3) + plt.ylabel("Average island size") + plt.plot(pos_order3, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.legend(loc = "upper left", title="Methods") + plt.savefig("../results/Match_island_3_"+str(error)+".png",bbox_inches='tight') + + fig = plt.figure() + X = np.arange(len(k_size_order3)) + colors = 
["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] + plt.xlabel("k") + plt.xticks(pos_order3, k_size_order3) + plt.ylabel("Average island size") + plt.plot(pos_order3, [x[0] for x in kmers], color = colors[0], label='k-mer', linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in gapped4_kmers], color = colors[1], label='4 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in gapped8_kmers], color = colors[2], label='8 k-mer',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='minstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='hybridstrobemers',linewidth=3.0) + plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='randstrobemers',linewidth=3.0) + plt.legend(title="Methods", loc = "upper left") + plt.savefig("../results/Match_island_38_"+str(error)+".png",bbox_inches='tight') diff --git a/src/snakemake/accuracy/plot_match_representative.py b/src/snakemake/accuracy/plot_match_representative.py index 586a260..c0d81d1 100644 --- a/src/snakemake/accuracy/plot_match_representative.py +++ b/src/snakemake/accuracy/plot_match_representative.py @@ -22,8 +22,8 @@ def read_file(results, files): if (line[:7]=="Match C"): cov = round(float(line.split()[2]),2) if (line[:7]=="Islands"): - mean = round(float(line.split('\t')[2]),2) - stdev = round(float(line.split('\t')[3]),2) + mean = round(float(line.split('\t')[1]),2) + stdev = round(float(line.split('\t')[2]),2) results.append((mean,stdev,cov)) return results @@ -32,7 +32,7 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 fig = plt.figure() X = np.arange(len(k_size)) - colors = ["#01d63a","#00e7e0","#fefea1","#748beb"] + colors = ["#890015","#5cffca","#a13ff0","#ff9ba0"] plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Average island size") @@ -48,8 +48,6 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 # 
Plot comparison between all match coverage fig = plt.figure() X = np.arange(len(k_size)) - - colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Match coverage") @@ -60,7 +58,7 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 plt.plot(pos, [x[2] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) - plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.legend(loc="upper right", title="Methods") plt.savefig(outfile2,bbox_inches='tight') diff --git a/src/snakemake/count/plot_counts_representative2.py b/src/snakemake/count/plot_counts_representative2.py index 76ad554..8119190 100644 --- a/src/snakemake/count/plot_counts_representative2.py +++ b/src/snakemake/count/plot_counts_representative2.py @@ -371,6 +371,7 @@ def plot_bar(minimiser_all, modmer_all, opensyncmer_all, closedsyncmer_all, outf ax.bar(X + 0.6, closedsyncmer_all, color = colors[3], label='syncmer', width = 0.2) ax.set_xticks([0.3,1.3,2.3,3.3]) ax.set_xticklabels(["k-mer","min", "hybrid", "rand"]) + plt.ylabel("% of unique submers") plt.legend(title="Methods", bbox_to_anchor=(1.01, 0.65)) plt.savefig(outfile,bbox_inches='tight') From 4036359792083c686e3fd92d3217711ee656b8bc Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 12 Jun 2023 10:21:23 +0200 Subject: [PATCH 25/34] Correct seed default. 
--- include/modmer.hpp | 186 ++++++++++++++++++++++++++-------- include/modmer_hash.hpp | 12 +-- include/shared.hpp | 2 - src/main.cpp | 2 +- test/api/modmer_hash_test.cpp | 40 ++++---- test/api/modmer_test.cpp | 18 +--- 6 files changed, 172 insertions(+), 88 deletions(-) diff --git a/include/modmer.hpp b/include/modmer.hpp index 1f2a56f..7f51a7a 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -21,6 +21,8 @@ #include #include +#include "shared.hpp" + namespace seqan3::detail { // --------------------------------------------------------------------------------------------------------------------- @@ -31,15 +33,16 @@ namespace seqan3::detail * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must * model std::totally_ordered. The typical use case is that the reference type is the result of * seqan3::kmer_hash. - * \tparam measure_distance If true, then not the actual modmers are returned, but the distances of the modmers. + * \tparam urng2_t The type of the underlying range, must model std::ranges::forward_range, the reference type must + * model std::totally_ordered. The typical use case is that the reference type is the result of + * seqan3::kmer_hash. * \implements std::ranges::view * \ingroup search_views * * * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. - */ -template +template > class modmer_view : public std::ranges::view_interface> { private: @@ -47,15 +50,33 @@ class modmer_view : public std::ranges::view_interface> static_assert(std::totally_ordered>, "The reference type of the underlying range must model std::totally_ordered."); + //!\brief The default argument of the second range. + using default_urng2_t = std::ranges::empty_view; + + //!\brief Boolean variable, which is true, when second range is not of empty type. 
+ static constexpr bool second_range_is_given = !std::same_as; + + static_assert(!second_range_is_given + || std::totally_ordered_with, + std::ranges::range_reference_t>, + "The reference types of the underlying ranges must model std::totally_ordered_with."); + //!\brief Whether the given ranges are const_iterable - static constexpr bool const_iterable = seqan3::const_iterable_range; + static constexpr bool const_iterable = + seqan3::const_iterable_range && seqan3::const_iterable_range; //!\brief The first underlying range. urng1_t urange1{}; - //!\brief The number of values in one window. + //!\brief The second underlying range. + urng2_t urange2{}; + + //!\brief The mod value used. size_t mod_used{}; + //!\brief The seed used. + size_t seed_used{}; + template class basic_iterator; @@ -78,11 +99,11 @@ class modmer_view : public std::ranges::view_interface> /*!\brief Construct from a view and a given number of values in one window. * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and * std::ranges::forward_range. - * \param[in] mod_used The number of values in one window. + * \param[in] mod_used The modvalue used. + * \param[in] seed_used The seed used. */ - modmer_view(urng1_t urange1, size_t const mod_used) : - urange1{std::move(urange1)}, - mod_used{mod_used} + explicit modmer_view(urng1_t urange1, size_t const mod_used, uint64_t const seed_used) : + modmer_view{std::move(urange1), default_urng2_t{}, mod_used, seed_used} {} /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. @@ -90,18 +111,72 @@ class modmer_view : public std::ranges::view_interface> from urng1_t. * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and * std::ranges::forward_range. - * \param[in] mod_used The number of values in one window. + * \param[in] mod_used The modvalue used. + * \param[in] seed_used The seed used. 
*/ template //!\cond requires (std::ranges::viewable_range && std::constructible_from>>) //!\endcond - modmer_view(other_urng1_t && urange1, size_t const mod_used) : + modmer_view(other_urng1_t && urange1, size_t const mod_used, uint64_t const seed_used) : urange1{std::views::all(std::forward(urange1))}, - mod_used{mod_used} + urange2{default_urng2_t{}}, + mod_used{mod_used}, + seed_used{seed_used} {} + /*!\brief Construct from a view and a given number of values in one window. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] mod_used The modvalue used. + * \param[in] seed_used The seed used. + */ + modmer_view(urng1_t urange1, urng2_t urange2, size_t const mod_used, uint64_t const seed_used) : + urange1{urange1}, + urange2{urange2}, + mod_used{mod_used}, + seed_used{seed_used} + { + if constexpr (second_range_is_given) + { + if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + + /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \tparam other_urng2_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng2_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] mod_used The modvalue used. + * \param[in] seed_used The seed used. 
+ */ + template + requires (std::ranges::viewable_range + && std::constructible_from> + && std::ranges::viewable_range + && std::constructible_from>) + explicit modmer_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const mod_used, uint64_t const seed_used) : + urange1{std::views::all(std::forward(urange1))}, + urange2{std::views::all(std::forward(urange2))}, + mod_used{mod_used}, + seed_used{seed_used} + { + if constexpr (second_range_is_given) + { + if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + /*!\name Iterators * \{ */ @@ -121,8 +196,10 @@ class modmer_view : public std::ranges::view_interface> basic_iterator begin() { return {std::ranges::begin(urange1), + std::ranges::begin(urange2), std::ranges::end(urange1), - mod_used}; + mod_used, + seed_used}; } //!\copydoc begin() @@ -132,8 +209,10 @@ class modmer_view : public std::ranges::view_interface> //!\endcond { return {std::ranges::cbegin(urange1), + std::ranges::cbegin(urange2), std::ranges::cend(urange1), - mod_used}; + mod_used, + seed_used}; } /*!\brief Returns an iterator to the element following the last element of the range. @@ -159,15 +238,17 @@ class modmer_view : public std::ranges::view_interface> }; //!\brief Iterator for calculating modmers. -template +template template -class modmer_view::basic_iterator +class modmer_view::basic_iterator { private: //!\brief The sentinel type of the first underlying range. using urng1_sentinel_t = maybe_const_sentinel_t; //!\brief The iterator type of the first underlying range. using urng1_iterator_t = maybe_const_iterator_t; + //!\brief The iterator type of the second underlying range. 
+ using urng2_iterator_t = maybe_const_iterator_t; template friend class basic_iterator; @@ -207,14 +288,17 @@ class modmer_view::basic_iterator //!\endcond : modmer_value{std::move(it.modmer_value)}, urng1_iterator{std::move(it.urng1_iterator)}, + urng2_iterator{std::move(it.urng2_iterator)}, urng1_sentinel{std::move(it.urng1_sentinel)} {} /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number of values per window. * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range. + * \param[in] urng2_iterator Iterator pointing to the first position of the second std::totally_ordered range. * \param[in] urng1_sentinel Iterator pointing to the last position of the first std::totally_ordered range. - * \param[in] mod_used The number of values in one window. + * \param[in] mod_used The modvalue used. + * \param[in] mod_used The seed value used. * * \details * @@ -223,11 +307,15 @@ class modmer_view::basic_iterator * once. */ basic_iterator(urng1_iterator_t urng1_iterator, + urng2_iterator_t urng2_iterator, urng1_sentinel_t urng1_sentinel, - size_t mod_used) : - urng1_iterator{std::move(urng1_iterator)}, - urng1_sentinel{std::move(urng1_sentinel)}, - mod{mod_used} + size_t mod_used, + uint64_t seed_used) : + urng1_iterator{urng1_iterator}, + urng2_iterator{urng2_iterator}, + urng1_sentinel{urng1_sentinel}, + mod{mod_used}, + seed{seed_used} { size_t size = std::ranges::distance(urng1_iterator, urng1_sentinel); mod_used = std::min(mod_used, size); @@ -243,7 +331,7 @@ class modmer_view::basic_iterator //!\brief Compare to another basic_iterator. friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) { - return (lhs.urng1_iterator == rhs.urng1_iterator); + return ((lhs.urng1_iterator == rhs.urng1_iterator) && (rhs.urng2_iterator == rhs.urng2_iterator)); } //!\brief Compare to another basic_iterator. 
@@ -304,19 +392,23 @@ class modmer_view::basic_iterator //!\brief Iterator to the rightmost value of one window. urng1_iterator_t urng1_iterator{}; + //!\brief Iterator to the rightmost value of one window of the second range. + urng2_iterator_t urng2_iterator{}; //!brief Iterator to last element in range. urng1_sentinel_t urng1_sentinel{}; //!brief The mod value used. size_t mod{}; - //!brief The distance stored. Only relevant, if measure_distance is true. - size_t distance{1}; + + //!brief The seed value used. + uint64_t seed{}; //!\brief Advances the window to the next position. void advance() { - distance++; ++urng1_iterator; + if constexpr (second_range_is_given) + ++urng2_iterator; } void first_modmer() @@ -344,31 +436,37 @@ class modmer_view::basic_iterator if (urng1_iterator == urng1_sentinel) return true; - if (*urng1_iterator % mod == 0) + if constexpr (second_range_is_given) { - if constexpr (measure_distance) + if (fnv_hash(std::min(*urng1_iterator, *urng2_iterator), seed) % mod == 0) { - modmer_value = distance - 1; - distance = 0; + modmer_value = std::min(*urng1_iterator, *urng2_iterator); + return true; } - else + + return false; + } + else + { + if (fnv_hash(*urng1_iterator, seed) % mod == 0) { modmer_value = *urng1_iterator; + return true; } - return true; - } - return false; + return false; + } } }; //!\brief A deduction guide for the view class template. template -modmer_view(rng1_t &&, size_t const mod_used) -> modmer_view>; - -template -modmer_view(rng1_t &&, size_t const mod_used) -> modmer_view, m1>; +modmer_view(rng1_t &&, size_t const mod_used, uint64_t seed_used) -> modmer_view>; +//!\brief A deduction guide for the view class template. 
+template +modmer_view(rng1_t &&, rng2_t &&, size_t const mod_used, uint64_t seed_used) + -> modmer_view, std::views::all_t>; // --------------------------------------------------------------------------------------------------------------------- // modmer_fn (adaptor definition) @@ -380,9 +478,9 @@ modmer_view(rng1_t &&, size_t const mod_used) -> modmer_view - constexpr auto operator()(urng1_t && urange1, size_t const mod_used) const + constexpr auto operator()(urng1_t && urange1, size_t const mod_used, uint64_t const seed_used = 0) const { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer must model std::ranges::forward_range."); - return modmer_view{urange1, mod_used}; + return modmer_view{urange1, mod_used, seed_used}; } }; //![adaptor_def] @@ -412,6 +513,7 @@ struct modmer_fn * parameter is omitted in pipe notation] * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation] * \param[in] mod_used The mod value used. + * \param[in] seed_used The seed value used. * \returns A range of std::totally_ordered where each value is ... See below for the * properties of the returned range. * \ingroup search_views diff --git a/include/modmer_hash.hpp b/include/modmer_hash.hpp index 1b34929..7a2a03c 100644 --- a/include/modmer_hash.hpp +++ b/include/modmer_hash.hpp @@ -19,7 +19,6 @@ #include #include "modmer.hpp" -#include "shared.hpp" namespace seqan3::detail { @@ -74,19 +73,14 @@ struct modmer_hash_fn // throw std::invalid_argument{"The chosen mod_used is not valid. 
" // "Please choose a value greater than 1."}; - auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}); + auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape); auto reverse_strand = std::forward(urange) | seqan3::views::complement | std::views::reverse | seqan3::views::kmer_hash(shape) - | std::views::transform([seed] (uint64_t i) - {return i ^ seed.get();}) | std::views::reverse; - // fnv_hash ensures actual randomness. - auto combined_strand = seqan3::views::zip(forward_strand, reverse_strand) | std::views::transform([seed](std::tuple i){return fnv_hash(std::get<0>(i) + std::get<1>(i), seed.get());}); - return seqan3::detail::modmer_view(combined_strand, mod_used); + + return seqan3::detail::modmer_view(forward_strand, reverse_strand, mod_used, seed.get()); } }; diff --git a/include/shared.hpp b/include/shared.hpp index f0dc473..0e2ff55 100644 --- a/include/shared.hpp +++ b/include/shared.hpp @@ -12,12 +12,10 @@ uint64_t fnv_hash(uint64_t hash_value, uint64_t seed) constexpr static uint64_t default_offset_basis = 0xcbf29ce484222325; constexpr static uint64_t prime = 0x100000001b3; - uint64_t hashed = hash_value; std::ostringstream os; os << hash_value; std::string oss = os.str(); - for (int i = 0; i < oss.size(); i++) { hashed = hashed * prime; diff --git a/src/main.cpp b/src/main.cpp index 1fe7813..f1f0cf0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,7 +7,7 @@ uint32_t w_size; uint64_t shape{}; -uint64_t se; +uint64_t se{0x8F3F73B5CF1C9ADEULL}; void string_to_methods(std::string name, methods & m) { diff --git a/test/api/modmer_hash_test.cpp b/test/api/modmer_hash_test.cpp index 7fc1839..9e906ee 100644 --- a/test/api/modmer_hash_test.cpp +++ b/test/api/modmer_hash_test.cpp @@ -28,15 +28,9 @@ using iterator_type = std::ranges::iterator_t struct iterator_fixture : public ::testing::Test @@ -45,7 +39,7 @@ struct iterator_fixture : public 
::testing::Test static constexpr bool const_iterable = false; seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; - result_t expected_range{27+27, 191+1, 252+192, 242+112}; + result_t expected_range{26, 152, 6, 192, 112}; using test_range_t = decltype(text | ungapped_view); test_range_t test_range = text | ungapped_view; @@ -70,7 +64,7 @@ class modmer_hash_test : public ::testing::Test { protected: std::vector text1{"AAAAAA"_dna4}; - result_t result1{}; // Same result for ungapped and gapped + result_t result1{0,0,0}; // Same result for ungapped and gapped std::vector too_short_text{"AC"_dna4}; @@ -78,9 +72,13 @@ class modmer_hash_test : public ::testing::Test // CCGT GCCG CGCC TCGC GTCG CGTC ACGT AACG AAAC TAAA CTAA // ACGG CGGC cgcc GCGA CGAC cgtc ACGT aacg aaac taaa ctaa std::vector text3{"ACGGCGACGTTTAG"_dna4}; - result_t result3_ungapped{27+27, 191+1, 252+192, 242+112}; // ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA - result_t result3_gapped{3+3, 11+1, 12+12, 14+4}; // A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap - result_t result3_ungapped_3{117, 255, 267, 369, 279, 243, 27+27, 117, 191+1, 252+192, 242+112}; + result_t result3_ungapped{26, 152, 6, 192, 112}; // ACGG, GCGA, aacg, taaa, ctaa + result_t result3_gapped{2, 8, 2, 12, 4}; // A--G, G--A, a--g, t--a, c--a - "-" for gap + result_t result3_ungapped_3{105,27,6, 192}; + result_t result3_ungapped_stopped{26, 152}; + result_t result3_gapped_stopped{2, 8}; + result_t result3_ungapped_start{6, 192, 112}; + result_t result3_gapped_start{2, 12, 4}; }; template @@ -101,8 +99,8 @@ TYPED_TEST(modmer_hash_view_properties_test, different_input_ranges) { TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG - result_t ungapped{27+27,216+216, 27+27, 191+1, 252+192, 242+112}; // ACGT/ACGT, TCGA/TCGA, ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA - result_t gapped{3+3, 12+12, 3+3, 11+1, 12+12, 
14+4}; // A--T/A--T, T--A/T--A, A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap + result_t ungapped{216, 6, 192, 112}; // TCGA, aacg, taaa, ctaa + result_t gapped{12, 2, 12, 4}; // T--A, a--g, t--a, c--a - "-" for gap EXPECT_RANGE_EQ(ungapped, text | ungapped_view); EXPECT_RANGE_EQ(gapped, text | gapped_view); } @@ -111,7 +109,7 @@ TEST_F(modmer_hash_test, ungapped) { EXPECT_RANGE_EQ(result1, text1 | ungapped_view); EXPECT_TRUE(std::ranges::empty(too_short_text | ungapped_view)); - EXPECT_RANGE_EQ(result3_ungapped, text3 | ungapped_view); + EXPECT_RANGE_EQ(result3_ungapped, (text3 | ungapped_view)); EXPECT_NO_THROW(text1 | modmer_hash(ungapped_shape, 2)); EXPECT_RANGE_EQ(result3_ungapped_3, text3 | ungapped_3_view); } @@ -127,10 +125,10 @@ TEST_F(modmer_hash_test, gapped) TEST_F(modmer_hash_test, combinability) { auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); - EXPECT_RANGE_EQ(result1, text3 | stop_at_t | ungapped_view); - EXPECT_RANGE_EQ(result1, text3 | stop_at_t | gapped_view); + EXPECT_RANGE_EQ(result3_ungapped_stopped, text3 | stop_at_t | ungapped_view); + EXPECT_RANGE_EQ(result3_gapped_stopped, text3 | stop_at_t | gapped_view); auto start_at_a = std::views::drop(6); - EXPECT_RANGE_EQ(result3_ungapped, text3 | start_at_a | ungapped_view); - EXPECT_RANGE_EQ(result3_gapped, text3 | start_at_a | gapped_view); + EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | ungapped_view); + EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_view); } diff --git a/test/api/modmer_test.cpp b/test/api/modmer_test.cpp index 7dbbb6b..b9d2d51 100644 --- a/test/api/modmer_test.cpp +++ b/test/api/modmer_test.cpp @@ -65,7 +65,6 @@ class modmer_test : public ::testing::Test protected: std::vector text1{"AAAAAA"_dna4}; result_t result1{0, 0, 0}; // Same result for ungapped and gapped - result_t result1_distance{0, 0, 0}; std::vector too_short_text{"AC"_dna4}; @@ -75,13 +74,10 @@ class modmer_test : public 
::testing::Test std::vector text3{"ACGGCGACGTTTAG"_dna4}; result_t result3_ungapped{26, 166, 152, 134, 252, 242}; // ACGG, GGCG, GCGA, GACG, TTTA, TTAG result_t result3_gapped{2, 10, 8, 10, 12, 14}; // A--G, G--G, G--A, G--G, T--A, T--G "-" for gap - result_t result3_distance{0, 1, 0, 1, 3, 0}; result_t result3_ungapped_stop{26, 166, 152, 134}; // ACGG, GGCG, GCGA, GACG result_t result3_gapped_stop{2, 10, 8, 10}; - result_t result3_distance_stop{0, 1, 0, 1}; result_t result3_ungapped_start{252, 242}; // For start at second A, TTTA, TTAG result_t result3_gapped_start{14}; // For start at second A, T--G - result_t result3_distance_start{3, 0}; }; template @@ -125,9 +121,7 @@ TEST_F(modmer_test, ungapped_kmer_hash) EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | modmer_view); auto v1 = text1 | kmer_view; - EXPECT_RANGE_EQ(result1_distance, (seqan3::detail::modmer_view(v1, 2))); auto v2 = text3 | kmer_view; - EXPECT_RANGE_EQ(result3_distance, (seqan3::detail::modmer_view(v2, 2))); } TEST_F(modmer_test, gapped_kmer_hash) @@ -138,9 +132,7 @@ TEST_F(modmer_test, gapped_kmer_hash) EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | modmer_view); auto v1 = text1 | gapped_kmer_view; - EXPECT_RANGE_EQ(result1_distance, (seqan3::detail::modmer_view(v1, 2))); auto v2 = text3 | gapped_kmer_view; - EXPECT_RANGE_EQ(result3_distance, (seqan3::detail::modmer_view(v2, 2))); } TEST_F(modmer_test, combinability) @@ -151,14 +143,14 @@ TEST_F(modmer_test, combinability) auto v1 = text3 | stop_at_t | kmer_view; auto v2 = text3 | stop_at_t | kmer_view; - EXPECT_RANGE_EQ(result3_distance_stop, (seqan3::detail::modmer_view(v1, 2))); - EXPECT_RANGE_EQ(result3_distance_stop, (seqan3::detail::modmer_view(v2, 2))); auto start_at_a = std::views::drop(6); - EXPECT_RANGE_EQ(result3_ungapped_start, (seqan3::detail::modmer_view{text3 | start_at_a | kmer_view, 2})); + EXPECT_RANGE_EQ(result3_ungapped_start, (seqan3::detail::modmer_view{text3 | start_at_a | kmer_view, 2, 0})); auto v3 = 
text3 | start_at_a | kmer_view; auto v4 = text3 | start_at_a | gapped_kmer_view; - EXPECT_RANGE_EQ(result3_distance_start, (seqan3::detail::modmer_view(v3, 2))); - EXPECT_RANGE_EQ(result3_distance_start, (seqan3::detail::modmer_view(v4, 2))); + + std::vector ints{7, 182, 3, 9, 5, 216, 134, 252, 3, 242}; + std::vector ints_results{182, 216, 134, 252, 242}; + EXPECT_RANGE_EQ(ints_results, ints | modmer_view); } From 489b05591b7eefcbff8658244f8bd7a957b2346b Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 12 Jun 2023 10:22:19 +0200 Subject: [PATCH 26/34] Correct distance --- src/compare.cpp | 98 +++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/src/compare.cpp b/src/compare.cpp index 87be16d..14d896a 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -14,6 +14,8 @@ #include "randstrobe_hash.hpp" #include "syncmer_hash.hpp" +#include + /*! \brief Returns expected value of given list. * \param results The vector from which mean and variance should be calculated of. 
*/ @@ -175,7 +177,6 @@ void accuracy(urng_t input_view, outfile << "\n"; } outfile.close(); - // Store tp, tn, fp, fn std::ofstream outfile2; outfile2.open(std::string{args.path_out} + method_name + "_" + std::string{args.search_file.stem()} + "_accuracy.out"); @@ -320,19 +321,13 @@ std::vector read_seq_file(std::filesystem::path sequence_file, range_a } template -void distance(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view) +void distance(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view, std::string method_name) { std::vector distances{}; - int distance = 0; - int it_1 = 0; - int it_2 = 0; - - std::vector vector{}; seqan3::sequence_file_input> fin{sequence_file}; for (auto & [seq] : fin) { int distance = 0; - auto representative = seq | input_view; auto rep_it = representative.begin(); auto compare = seq | compare_view; @@ -348,21 +343,24 @@ void distance(std::filesystem::path sequence_file, urng_t input_view, urng_t2 co } rep_it++; } - else - { - distance++; - } + distance++; comp_it++; } } - double mean_distance, stdev_distance; - get_mean_and_var(distances, mean_distance, stdev_distance); - std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; + std::ofstream outfile; + outfile.open(method_name + "_"+ std::string{sequence_file.stem()} + "_distances.out"); + for (int i = *std::min_element(distances.begin(), distances.end()); i <= *std::max_element(distances.begin(), distances.end()); i++) + { + auto count = std::count(distances.begin(), distances.end(), i); + if (count > 0) + outfile << i << "\t" << count << "\n"; + } + outfile.close(); } template -void distance_strobemer(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view) +void distance_strobemer(std::filesystem::path sequence_file, urng_t input_view, urng_t2 compare_view, 
std::string method_name) { std::vector distances{}; int distance = 0; @@ -400,13 +398,19 @@ void distance_strobemer(std::filesystem::path sequence_file, urng_t input_view, while((rep_it != representative.end()) & (comp_it != compare.end())); } - double mean_distance, stdev_distance; - get_mean_and_var(distances, mean_distance, stdev_distance); - std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; + std::ofstream outfile; + outfile.open(method_name + "_"+ std::string{sequence_file.stem()} + "_distances.out"); + for (int i = *std::min_element(distances.begin(), distances.end()); i <= *std::max_element(distances.begin(), distances.end()); i++) + { + auto count = std::count(distances.begin(), distances.end(), i); + if (count > 0) + outfile << i << "\t" << count << "\n"; + } + outfile.close(); } template -void distance_syncmer(std::filesystem::path sequence_file, urng_t input_view, range_arguments & args) +void distance_syncmer(std::filesystem::path sequence_file, urng_t input_view, range_arguments & args, std::string method_name) { std::vector distances{}; int distance = 0; @@ -439,9 +443,15 @@ void distance_syncmer(std::filesystem::path sequence_file, urng_t input_view, ra while(rep_it != representative.end()); } - double mean_distance, stdev_distance; - get_mean_and_var(distances, mean_distance, stdev_distance); - std::cout << "Distances: " << *std::min_element(distances.begin(), distances.end()) << "\t" << mean_distance << "\t" << stdev_distance << "\t" << *std::max_element(distances.begin(), distances.end()) << "\n"; + std::ofstream outfile; + outfile.open(method_name + "_"+ std::string{sequence_file.stem()} + "_distances.out"); + for (int i = *std::min_element(distances.begin(), distances.end()); i <= *std::max_element(distances.begin(), distances.end()); i++) + { + auto count = std::count(distances.begin(), 
distances.end(), i); + if (count > 0) + outfile << i << "\t" << count << "\n"; + } + outfile.close(); } void fill_positions(std::vector & positions, int pos, int match_length) @@ -933,47 +943,47 @@ void do_distance(std::filesystem::path sequence_file, range_arguments & args, bo { case minimiser: { if (args.hybrid & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.hybrid & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, 
args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); } break; case modmers: { if (args.hybrid & (args.order == 2)) - distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.hybrid & (args.order == 3)) - distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()), hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 2)) - distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), 
std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 3)) - distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()), minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 2)) - distance_strobemer(sequence_file, modmer(args.w_size.get()),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 3)) - distance_strobemer(sequence_file, modmer(args.w_size.get()), randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se)); + distance_strobemer(sequence_file, modmer(args.w_size.get()), randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); } break; case syncmer: { if (args.hybrid & (args.order == 2)) - distance_syncmer(sequence_file, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); if (args.hybrid & (args.order == 3)) - distance_syncmer(sequence_file, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 2)) - distance_syncmer(sequence_file, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, minstrobe2_hash(args.shape, args.w_min, 
args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 3)) - distance_syncmer(sequence_file, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 2)) - distance_syncmer(sequence_file, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 3)) - distance_syncmer(sequence_file, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args); + distance_syncmer(sequence_file, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), args, std::string{args.path_out} + create_name(args, true)); } break; } @@ -983,13 +993,13 @@ void do_distance(std::filesystem::path sequence_file, range_arguments & args, bo switch(args.name) { case minimiser: distance(sequence_file, seqan3::views::minimiser_hash(args.shape, args.w_size, args.seed_se), - seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se)); + seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se), std::string{args.path_out} + create_name(args)); break; case modmers: distance(sequence_file, modmer_hash(args.shape, args.w_size.get(), args.seed_se), - modmer_hash(args.shape, 1, args.seed_se)); + modmer_hash(args.shape, 1, args.seed_se), std::string{args.path_out} + create_name(args)); break; case syncmer: distance(sequence_file, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), - seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se)); + 
seqan3::views::minimiser_hash(args.shape, seqan3::window_size{args.shape.size()}, args.seed_se), std::string{args.path_out} + create_name(args)); break; } } From c45dc898d814cd086d894b8bfd6c10e3f8b5a9bc Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 12 Jun 2023 12:08:58 +0200 Subject: [PATCH 27/34] Fixes errors. --- include/modmer.hpp | 2 -- test/cli/minions_distance_test.cpp | 8 ++++---- test/cli/minions_match_test.cpp | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/modmer.hpp b/include/modmer.hpp index 7f51a7a..2bfa743 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -495,8 +495,6 @@ struct modmer_fn template constexpr auto operator()(urng1_t && urange1, size_t const mod_used, uint64_t const seed_used = 0) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer must model std::ranges::forward_range."); diff --git a/test/cli/minions_distance_test.cpp b/test/cli/minions_distance_test.cpp index 1d46c44..9a5a39c 100644 --- a/test/cli/minions_distance_test.cpp +++ b/test/cli/minions_distance_test.cpp @@ -18,7 +18,7 @@ TEST_F(cli_test, minimiser) { cli_test_result result = execute_app("minions distance --method minimiser -k 19 -w 19 ", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Distances: 0\t0\t0\t0\n"}); + EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, std::string{}); } @@ -26,7 +26,7 @@ TEST_F(cli_test, gapped_minimiser) { cli_test_result result = execute_app("minions distance --method minimiser -k 19 -w 19 --shape 524223", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Distances: 0\t0\t0\t0\n"}); + EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, std::string{}); } @@ -34,7 +34,7 @@ TEST_F(cli_test, modmer) { cli_test_result result = 
execute_app("minions distance --method modmer -k 19 -w 2", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Distances: 0\t1.03566\t1.51183\t49\n"}); + EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, std::string{}); } @@ -42,7 +42,7 @@ TEST_F(cli_test, syncmer) { cli_test_result result = execute_app("minions distance --method syncmer -k 19 -w 2 -p 0", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Distances: 0\t3.74393\t4.88286\t40\n"}); + EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, std::string{}); } diff --git a/test/cli/minions_match_test.cpp b/test/cli/minions_match_test.cpp index 7db7e7c..05200ee 100644 --- a/test/cli/minions_match_test.cpp +++ b/test/cli/minions_match_test.cpp @@ -34,7 +34,7 @@ TEST_F(cli_test, modmer) { cli_test_result result = execute_app("minions match --method modmer -k 19 -w 2 ", data("example1.fasta"), data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 3139105\tMissed: 0\nMatch Coverage: 99.9723\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 3221251\tMissed: 0\nMatch Coverage: 99.9999\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); EXPECT_EQ(result.err, std::string{}); } @@ -42,7 +42,7 @@ TEST_F(cli_test, syncmer) { cli_test_result result = execute_app("minions match --method syncmer -k 19 -w 2 -p 0", data("example1.fasta"), data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 1305894\tMissed: 0\nMatch Coverage: 97.9846\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 339395\tMissed: 0\nMatch Coverage: 65.7439\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); EXPECT_EQ(result.err, std::string{}); } From e5d54a3e8a87cb18a5d8667d25ef799fce3daacf Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 12 Jun 2023 
12:20:46 +0200 Subject: [PATCH 28/34] Revert "Count enables representative methods for strobemers and some clean up." This reverts commit ba3fbc6a99822364a4f3a294e2db8456351b58d7. --- include/compare.h | 6 +- include/modmer.hpp | 2 + include/modmer_hash.hpp | 2 + lib/seqan3 | 2 +- src/compare.cpp | 90 ++++++++++++-- src/main.cpp | 5 +- src/snakemake/genmap/genmap_uniqueness.py | 25 ++-- src/snakemake/speed/Snakefile | 13 +- src/snakemake/speed/plot_speed.py | 139 ++++++++++++++++++++++ 9 files changed, 240 insertions(+), 44 deletions(-) diff --git a/include/compare.h b/include/compare.h index b33b9c6..f7efdc8 100644 --- a/include/compare.h +++ b/include/compare.h @@ -108,9 +108,8 @@ void store_ibf(IBFType const & ibf, /*! \brief Function that creates the string name of the used view. * \param args The arguments about the view to be used. - * \param args If true, "Strobmer" is added to the name. */ -std::string create_name(range_arguments & args, bool underlying_strobemer = false); +std::string create_name(range_arguments & args); /*! \brief Function, comparing the methods in regard of their coverage. * \param args The arguments about the view to be used. @@ -120,9 +119,8 @@ void do_accuracy(accuracy_arguments & args); /*! \brief Function, comparing the number of submers. * \param sequence_files A vector of sequence files. * \param args The arguments about the view to be used. - * \param underlying_strobemer True, if strobemers should be used with a representative method like minimizer. */ -void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer = false); +void do_counts(std::vector sequence_files, range_arguments & args); /*! \brief Function, comparing the methods in regard of their distance. * \param sequence_file A sequence file. 
diff --git a/include/modmer.hpp b/include/modmer.hpp index 2bfa743..7f51a7a 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -495,6 +495,8 @@ struct modmer_fn template constexpr auto operator()(urng1_t && urange1, size_t const mod_used, uint64_t const seed_used = 0) const { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer must model std::ranges::forward_range."); diff --git a/include/modmer_hash.hpp b/include/modmer_hash.hpp index 7a2a03c..69b86e2 100644 --- a/include/modmer_hash.hpp +++ b/include/modmer_hash.hpp @@ -64,6 +64,8 @@ struct modmer_hash_fn uint32_t const mod_used, seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer_hash must model std::ranges::forward_range."); static_assert(semialphabet>, diff --git a/lib/seqan3 b/lib/seqan3 index 9f83975..d29786b 160000 --- a/lib/seqan3 +++ b/lib/seqan3 @@ -1 +1 @@ -Subproject commit 9f83975b4a5dd4e73007b8040e5ba83647e72090 +Subproject commit d29786b61de73f14eed5c83c14ef7e02f038bdb1 diff --git a/src/compare.cpp b/src/compare.cpp index 14d896a..fff0085 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -190,19 +190,46 @@ void accuracy(urng_t input_view, * \param method_name Name of the tested method. * \param args The arguments about the view to be used, needed for strobemers. 
*/ +<<<<<<< HEAD template void counts(std::vector & sequence_files, urng_t input_view, std::string method_name, range_arguments & args) +======= +template +void counts(std::vector sequence_files, urng_t input_view, std::string method_name, range_arguments & args) +>>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) { std::vector counts_results{}; std::ofstream outfile; for (int i = 0; i < sequence_files.size(); ++i) { robin_hood::unordered_node_map hash_table{}; +<<<<<<< HEAD seqan3::sequence_file_input> fin{sequence_files[i]}; for (auto & [seq] : fin) { for (auto && hash : seq | input_view) hash_table[hash] = std::min(65534u, hash_table[hash] + 1); +======= + if constexpr (strobemers > 0) + { + seqan3::sequence_file_input> fin{sequence_files[i]}; + for (auto & [seq] : fin) + { + std::vector> strobes_vector; + get_strobemers(seq, args, strobes_vector); + for (auto & t : strobes_vector) // iterate over the strobemer tuples + hash_table[std::get<0>(t)] = std::min(65534u, hash_table[std::get<0>(t)] + 1); + } + } + else + { + seqan3::sequence_file_input> fin{sequence_files[i]}; + for (auto & [seq] : fin) + { + for (auto && hash : seq | input_view) + hash_table[hash] = std::min(65534u, hash_table[hash] + 1); + } +>>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} counts_results.push_back(hash_table.size()); @@ -762,21 +789,15 @@ void unique(std::vector input_files, std::filesystem::pat outfile.close(); } -std::string create_name(range_arguments & args, bool underlying_strobemer) +std::string create_name(range_arguments & args) { - std::string prefix{""}; - if (underlying_strobemer) - { - prefix = "Strobemer_"; - } - switch(args.name) { case kmer: return "kmer_hash_"+std::to_string(args.k_size); break; - case minimiser: return prefix +"minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case minimiser: return "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; - case modmers: return prefix +"modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case modmers: return "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; case strobemer: { std::ranges::empty_view empty{}; @@ -809,9 +830,10 @@ std::string create_name(range_arguments & args, bool underlying_strobemer) return ""; } } - case syncmer: return prefix + "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); + case syncmer: return "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); default: return ""; + } } @@ -847,10 +869,11 @@ void do_accuracy(accuracy_arguments & args) } } -void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer) +void do_counts(std::vector sequence_files, range_arguments & args) { - if(underlying_strobemer) + switch(args.name) { +<<<<<<< HEAD switch(args.name) { case minimiser: { @@ -932,6 +955,31 @@ void do_counts(std::vector sequence_files, range_argument 
counts(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } } +======= + case kmer: counts(sequence_files, seqan3::views::kmer_hash(args.shape), create_name(args), args); + break; + case minimiser: counts(sequence_files, seqan3::views::minimiser_hash(args.shape, + args.w_size, args.seed_se), create_name(args), args); + break; + case modmers: counts(sequence_files, modmer_hash(args.shape, + args.w_size.get(), args.seed_se), create_name(args), args); + break; + case syncmer: counts(sequence_files, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), + create_name(args), args); + break; + case strobemer: { + std::ranges::empty_view empty{}; + if (args.rand & (args.order == 2)) + counts, 1>(sequence_files, empty, create_name(args), args); + else if (args.rand & (args.order == 3)) + counts, 2>(sequence_files, empty, create_name(args), args); + else if (args.hybrid) + counts, 3>(sequence_files, empty, create_name(args), args); + else if (args.minstrobers) + counts, 4>(sequence_files, empty, create_name(args), args); + break; + } +>>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} } @@ -1103,8 +1151,26 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen else if (args.rand & (args.order == 3)) match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } +<<<<<<< HEAD break; } +======= + else + { + if (args.hybrid & (args.order == 2)) + match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + else if (args.hybrid & (args.order == 3)) + match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max),create_name(args), args); + else if (args.minstrobers & (args.order == 2)) + match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + else if (args.minstrobers & (args.order == 3)) + match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + else if (args.rand & (args.order == 2)) + match(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + else if (args.rand & (args.order == 3)) + match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); + } +>>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} } diff --git a/src/main.cpp b/src/main.cpp index f1f0cf0..19db73f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -118,13 +118,10 @@ int counts(seqan3::argument_parser & parser) { range_arguments args{}; std::vector sequence_files{}; - bool underlying_strobemer = false; parser.info.short_description = "Counts the number of submers in the given " "sequence files."; parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); - parser.add_flag(underlying_strobemer,'\0', "strobemer", "If strobemers should be used as base for representative " - "methods like minimizers. Default: False."); all_arguments(parser, args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", @@ -146,7 +143,7 @@ int counts(seqan3::argument_parser & parser) } string_to_methods(method, args.name); - do_counts(sequence_files, args, underlying_strobemer); + do_counts(sequence_files, args); return 0; } diff --git a/src/snakemake/genmap/genmap_uniqueness.py b/src/snakemake/genmap/genmap_uniqueness.py index 44feb1d..767dfe1 100644 --- a/src/snakemake/genmap/genmap_uniqueness.py +++ b/src/snakemake/genmap/genmap_uniqueness.py @@ -1,4 +1,3 @@ -import os import sys import numpy as np @@ -8,18 +7,12 @@ k_mers = [16,20,24,28,32] errors = [0,1,2] -def get_unique(in_file,k): +def get_unique(in_file): frequencies = np.fromfile(in_file, dtype=np.uint16) - number_elements = len(frequencies) # Problem counts all elements that appear more than once multiple times + number_elements = len (frequencies) number_unique = (frequencies == 1).sum() - file = "../count/0_minimiser_hash_"+str(k)+"_"+str(k)+"_counts.out" - if (os.path.exists(file)): # This is more accurate - with open(file, 'r') as f: - for line in f: - number_elements = round(float(line.split('\t')[2]),2) - - print(k, number_unique, number_elements, len(frequencies),os.path.exists(file)) + # print(number_elements, number_unique, (number_unique*100.0)/number_elements) return 
(number_unique*100.0)/number_elements def get_results(species): @@ -28,22 +21,22 @@ def get_results(species): results.append([]) for k in k_mers: for e in errors: - genmap_file = '../../../../genmap/build/genmap/'+species+'_K_'+str(k)+'_E_'+str(e)+'.freq16' - results[errors.index(e)].append(get_unique(genmap_file,k)) + genmap_file = 'genmap/'+species+'_K_'+str(k)+'_E_'+str(e)+'.freq16' + results[errors.index(e)].append(get_unique(genmap_file)) fig = plt.figure() X = np.arange(len(k_mers)) colors = ["#00ba32","#00d6e7","#fad100"] pos = [0.25,1.25,2.25,3.25,4.25] plt.xlabel("k") plt.xticks(pos, k_mers) - plt.ylabel("% of unique k-mers") + plt.ylabel("percentage of unique k-mers") plt.bar(X + 0.00, results[0], color = colors[0], width = 0.25, label='0') plt.bar(X + 0.25, results[1], color = colors[1], width = 0.25, label='1') plt.bar(X + 0.50, results[2], color = colors[2], width = 0.25, label='2') plt.legend(title="# of errors") - plt.savefig("../results/Uniqueness_"+species+".png") + plt.savefig("Uniqueness_"+species+".png") get_results("Human") -#get_results("Mouse") -#get_results("Wheat") +get_results("Mouse") +get_results("Wheat") diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index c9282a3..61ce6cb 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -14,11 +14,10 @@ rule plot: output: "../results/Speed_all.png", "../results/Speed_all8.png", - "../results/Speed_all_order3.png", - "../results/Speed_all8_order3.png", - "../results/Speed_strobemers_original_all.png", - "../results/Speed_minstrobemers_original.png", - "../results/Speed_hybridstrobemers_original.png", + "../results/Speed_kmers.png", + "../results/Speed_strobemers.png", + "../results/Speed_strobemers4.png", + "../results/Speed_strobemers8.png", "../results/Speed_randstrobemers_original.png", "../results/Speed_randstrobemers_original_order3.png" input: @@ -146,7 +145,7 @@ rule speed_minimiser_modmer: output: 
"0_{method}_hash_{kmer_size}_{w_size}_speed.out" shell: - "minions speed --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} -o 0_ {input}" + "minions speed --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape 0 -o 0_ {input}" rule speed_syncmer: input: @@ -154,4 +153,4 @@ rule speed_syncmer: output: "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_speed.out" shell: - "minions speed --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} {input}" + "minions speed --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input}" diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 51e089c..75a2209 100644 --- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -55,6 +55,107 @@ def read_file(results, files): original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +# Plot comparison between k-mers +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#00ba32","#00d6e7","#fad100"] +colors_error = ["#01d63a","#00e7e0","#fefea1"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in kmers], color = colors[0], label='0') +plt.plot(pos, [x[0] for x in gapped4_kmers], color = colors[1], label='4') +plt.plot(pos, [x[0] for x in gapped8_kmers], color = colors[2], label='8') + + +plt.fill_between(pos, [x[0]-x[1] for x in kmers], [x[0]+x[1] for x in kmers], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in gapped4_kmers], [x[0]+x[1] for x in gapped4_kmers], color = colors_error[1], alpha=0.7) 
+plt.fill_between(pos, [x[0]-x[1] for x in gapped8_kmers], [x[0]+x[1] for x in gapped8_kmers], color = colors_error[2], alpha=0.7) + +plt.legend(title="Number of gaps") +plt.savefig("../results/Speed_kmers.png") + +# Plot comparison between strobemers 4 gaps +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='minstrobemers') +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='hybridstrobemers') +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='randstrobemers') + +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) + +#plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers4.png") + +# Plot comparison between strobemers 8 gaps +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors[0], label='minstrobemers') +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[1], label='hybridstrobemers') +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[2], label='randstrobemers') + +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[0], alpha=0.7) 
+plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[2], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers8.png", bbox_inches='tight') + +# Plot comparison between strobemers 4 gaps order 3 +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='minstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[1], label='hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='randstrobemers') + +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) + +#plt.legend(bbox_to_anchor=(1.25, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers4_order3.png") + +# Plot comparison between strobemers 8 gaps order 3 +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1"] +colors_error = ["#748beb","#e47585","#b261c2"] +plt.xlabel("k") +plt.xticks(pos_order3, k_order3) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[0], label='minstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], 
color = colors[1], label='hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[2], label='randstrobemers') + +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = colors_error[1], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[2], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers8_order3.png", bbox_inches='tight') + # Plot comparison between all fig = plt.figure() X = np.arange(len(k_size)) @@ -142,6 +243,44 @@ def read_file(results, files): plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") plt.savefig("../results/Speed_all8_order3.png",bbox_inches='tight') +# Plot comparison between strobemers all gaps +fig = plt.figure() +X = np.arange(len(k_size)) +colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] +colors_error = ["#748beb","#e47585","#b261c2","#01d63a","#00e7e0","#fefea1"] +plt.xlabel("k") +plt.xticks(pos, k_size) +plt.ylabel("Speed in microseconds") # in microseconds + +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors[0], label='4 minstrobemers') +plt.plot(pos_order3, [x[0] for x in minstrobemers3], color = colors[0], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors[1], label='4 hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers3], color = colors[1], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors[2], label='4 randstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors[2], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in 
minstrobemers28], color = colors[3], label='8 minstrobemers') +plt.plot(pos_order3, [x[0] for x in minstrobemers38], color = colors[3], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors[4], label='8 hybridstrobemers') +plt.plot(pos_order3, [x[0] for x in hybridstrobemers38], color = colors[4], label='4 randstrobemers3',linestyle="dashed") +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors[5], label='8 randstrobemers') +plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors[5], label='8 randstrobemers3',linestyle="dashed") + +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[0], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers3], [x[0]+x[1] for x in minstrobemers3], color = colors_error[0], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[1], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers3], [x[0]+x[1] for x in hybridstrobemers3], color = colors_error[1], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[3], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in minstrobemers38], [x[0]+x[1] for x in minstrobemers38], color = colors_error[3], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[4], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in hybridstrobemers38], [x[0]+x[1] for x in hybridstrobemers38], color = 
colors_error[4], alpha=0.7) +plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) +plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) + +plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") +plt.savefig("../results/Speed_strobemers.png", bbox_inches='tight') + # Plot comparison between strobemers all gaps original_minstrobemers2 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) From 88b569e5376952d0b6bf6eec14c13b78ddb9cfae Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 12 Jun 2023 16:13:10 +0200 Subject: [PATCH 29/34] Fix --- include/compare.h | 6 ++- include/minstrobe_hash.hpp | 5 -- include/modmer.hpp | 2 - include/modmer_hash.hpp | 2 - src/compare.cpp | 90 +++++---------------------------- src/main.cpp | 7 ++- test/cli/minions_match_test.cpp | 4 +- 7 files changed, 23 insertions(+), 93 deletions(-) diff --git a/include/compare.h b/include/compare.h index f7efdc8..b33b9c6 100644 --- a/include/compare.h +++ b/include/compare.h @@ -108,8 +108,9 @@ void store_ibf(IBFType const & ibf, /*! \brief Function that creates the string name of the used view. * \param args The arguments about the view to be used. + * \param args If true, "Strobmer" is added to the name. */ -std::string create_name(range_arguments & args); +std::string create_name(range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, comparing the methods in regard of their coverage. * \param args The arguments about the view to be used. @@ -119,8 +120,9 @@ void do_accuracy(accuracy_arguments & args); /*! \brief Function, comparing the number of submers. * \param sequence_files A vector of sequence files. * \param args The arguments about the view to be used. 
+ * \param underlying_strobemer True, if strobemers should be used with a representative method like minimizer. */ -void do_counts(std::vector sequence_files, range_arguments & args); +void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer = false); /*! \brief Function, comparing the methods in regard of their distance. * \param sequence_file A sequence file. diff --git a/include/minstrobe_hash.hpp b/include/minstrobe_hash.hpp index 57ad986..96de388 100644 --- a/include/minstrobe_hash.hpp +++ b/include/minstrobe_hash.hpp @@ -69,8 +69,6 @@ struct minstrobe2_hash_fn uint32_t const window_len, seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::minstrobe_hash cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::minstrobe_hash must model std::ranges::forward_range."); static_assert(semialphabet>, @@ -95,7 +93,6 @@ struct minstrobe2_hash_fn // Todo: Instead of using vectors, use the std::views::reverse function and zip, but the view reverse is very slow in comparison. 
auto reverse = seqan3::detail::minstrobe_view(rev_hashed_values, window_min + shape.size() - 1, window_len - shape.size() + 1, shape.count()); - std::vector rev{}; for(auto && h : reverse) rev.push_back(h); @@ -159,8 +156,6 @@ struct minstrobe3_hash_fn uint32_t const window_len, seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::minstrobe_hash cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::minstrobe_hash must model std::ranges::forward_range."); static_assert(semialphabet>, diff --git a/include/modmer.hpp b/include/modmer.hpp index 7f51a7a..2bfa743 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -495,8 +495,6 @@ struct modmer_fn template constexpr auto operator()(urng1_t && urange1, size_t const mod_used, uint64_t const seed_used = 0) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer must model std::ranges::forward_range."); diff --git a/include/modmer_hash.hpp b/include/modmer_hash.hpp index 69b86e2..7a2a03c 100644 --- a/include/modmer_hash.hpp +++ b/include/modmer_hash.hpp @@ -64,8 +64,6 @@ struct modmer_hash_fn uint32_t const mod_used, seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const { - static_assert(std::ranges::viewable_range, - "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); static_assert(std::ranges::forward_range, "The range parameter to views::modmer_hash must model std::ranges::forward_range."); static_assert(semialphabet>, diff --git a/src/compare.cpp b/src/compare.cpp index fff0085..14d896a 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -190,46 +190,19 @@ void accuracy(urng_t input_view, * \param method_name Name of the tested method. 
* \param args The arguments about the view to be used, needed for strobemers. */ -<<<<<<< HEAD template void counts(std::vector & sequence_files, urng_t input_view, std::string method_name, range_arguments & args) -======= -template -void counts(std::vector sequence_files, urng_t input_view, std::string method_name, range_arguments & args) ->>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) { std::vector counts_results{}; std::ofstream outfile; for (int i = 0; i < sequence_files.size(); ++i) { robin_hood::unordered_node_map hash_table{}; -<<<<<<< HEAD seqan3::sequence_file_input> fin{sequence_files[i]}; for (auto & [seq] : fin) { for (auto && hash : seq | input_view) hash_table[hash] = std::min(65534u, hash_table[hash] + 1); -======= - if constexpr (strobemers > 0) - { - seqan3::sequence_file_input> fin{sequence_files[i]}; - for (auto & [seq] : fin) - { - std::vector> strobes_vector; - get_strobemers(seq, args, strobes_vector); - for (auto & t : strobes_vector) // iterate over the strobemer tuples - hash_table[std::get<0>(t)] = std::min(65534u, hash_table[std::get<0>(t)] + 1); - } - } - else - { - seqan3::sequence_file_input> fin{sequence_files[i]}; - for (auto & [seq] : fin) - { - for (auto && hash : seq | input_view) - hash_table[hash] = std::min(65534u, hash_table[hash] + 1); - } ->>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} counts_results.push_back(hash_table.size()); @@ -789,15 +762,21 @@ void unique(std::vector input_files, std::filesystem::pat outfile.close(); } -std::string create_name(range_arguments & args) +std::string create_name(range_arguments & args, bool underlying_strobemer) { + std::string prefix{""}; + if (underlying_strobemer) + { + prefix = "Strobemer_"; + } + switch(args.name) { case kmer: return "kmer_hash_"+std::to_string(args.k_size); break; - case minimiser: return "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case minimiser: return prefix +"minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; - case modmers: return "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); + case modmers: return prefix +"modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()); break; case strobemer: { std::ranges::empty_view empty{}; @@ -830,10 +809,9 @@ std::string create_name(range_arguments & args) return ""; } } - case syncmer: return "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); + case syncmer: return prefix + "syncmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get())+ "_" + std::to_string(args.positions[0]) + "_" + std::to_string(args.positions[args.positions.size()-1]); default: return ""; - } } @@ -869,11 +847,10 @@ void do_accuracy(accuracy_arguments & args) } } -void do_counts(std::vector sequence_files, range_arguments & args) +void do_counts(std::vector sequence_files, range_arguments & args, bool underlying_strobemer) { - switch(args.name) + if(underlying_strobemer) { -<<<<<<< HEAD switch(args.name) { case minimiser: { @@ -955,31 +932,6 @@ void do_counts(std::vector sequence_files, range_argument counts(sequence_files, 
randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } } -======= - case kmer: counts(sequence_files, seqan3::views::kmer_hash(args.shape), create_name(args), args); - break; - case minimiser: counts(sequence_files, seqan3::views::minimiser_hash(args.shape, - args.w_size, args.seed_se), create_name(args), args); - break; - case modmers: counts(sequence_files, modmer_hash(args.shape, - args.w_size.get(), args.seed_se), create_name(args), args); - break; - case syncmer: counts(sequence_files, syncmer_hash(args.w_size.get(), args.k_size, args.positions, args.seed_se), - create_name(args), args); - break; - case strobemer: { - std::ranges::empty_view empty{}; - if (args.rand & (args.order == 2)) - counts, 1>(sequence_files, empty, create_name(args), args); - else if (args.rand & (args.order == 3)) - counts, 2>(sequence_files, empty, create_name(args), args); - else if (args.hybrid) - counts, 3>(sequence_files, empty, create_name(args), args); - else if (args.minstrobers) - counts, 4>(sequence_files, empty, create_name(args), args); - break; - } ->>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} } @@ -1151,26 +1103,8 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen else if (args.rand & (args.order == 3)) match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args), args); } -<<<<<<< HEAD break; } -======= - else - { - if (args.hybrid & (args.order == 2)) - match(sequence_file1, sequence_file2, hybridstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); - else if (args.hybrid & (args.order == 3)) - match(sequence_file1, sequence_file2, hybridstrobe3_hash(args.shape, args.w_min, args.w_max),create_name(args), args); - else if (args.minstrobers & (args.order == 2)) - match(sequence_file1, sequence_file2, minstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); - else if (args.minstrobers & (args.order == 3)) - match(sequence_file1, sequence_file2, minstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); - else if (args.rand & (args.order == 2)) - match(sequence_file1, sequence_file2, randstrobe2_hash(args.shape, args.w_min, args.w_max), create_name(args), args); - else if (args.rand & (args.order == 3)) - match(sequence_file1, sequence_file2, randstrobe3_hash(args.shape, args.w_min, args.w_max), create_name(args), args); - } ->>>>>>> parent of ba3fbc6 (Count enables representative methods for strobemers and some clean up.) 
} } diff --git a/src/main.cpp b/src/main.cpp index 19db73f..1fe7813 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,7 +7,7 @@ uint32_t w_size; uint64_t shape{}; -uint64_t se{0x8F3F73B5CF1C9ADEULL}; +uint64_t se; void string_to_methods(std::string name, methods & m) { @@ -118,10 +118,13 @@ int counts(seqan3::argument_parser & parser) { range_arguments args{}; std::vector sequence_files{}; + bool underlying_strobemer = false; parser.info.short_description = "Counts the number of submers in the given " "sequence files."; parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); + parser.add_flag(underlying_strobemer,'\0', "strobemer", "If strobemers should be used as base for representative " + "methods like minimizers. Default: False."); all_arguments(parser, args); std::string method{}; parser.add_option(method, '\0', "method", "Pick your method.", @@ -143,7 +146,7 @@ int counts(seqan3::argument_parser & parser) } string_to_methods(method, args.name); - do_counts(sequence_files, args); + do_counts(sequence_files, args, underlying_strobemer); return 0; } diff --git a/test/cli/minions_match_test.cpp b/test/cli/minions_match_test.cpp index 05200ee..27ae451 100644 --- a/test/cli/minions_match_test.cpp +++ b/test/cli/minions_match_test.cpp @@ -34,7 +34,7 @@ TEST_F(cli_test, modmer) { cli_test_result result = execute_app("minions match --method modmer -k 19 -w 2 ", data("example1.fasta"), data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 3221251\tMissed: 0\nMatch Coverage: 99.9999\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 3853327\tMissed: 0\nMatch Coverage: 99.9981\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); EXPECT_EQ(result.err, std::string{}); } @@ -42,7 +42,7 @@ TEST_F(cli_test, syncmer) { cli_test_result result = execute_app("minions match --method syncmer -k 19 -w 2 -p 0", data("example1.fasta"), 
data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 339395\tMissed: 0\nMatch Coverage: 65.7439\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 1305894\tMissed: 0\nMatch Coverage: 97.9846\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); EXPECT_EQ(result.err, std::string{}); } From da6760f688ec60a92dda891b82d77bf8b4613f92 Mon Sep 17 00:00:00 2001 From: mitradarja Date: Mon, 3 Jul 2023 11:41:35 +0200 Subject: [PATCH 30/34] Updated. --- include/modmer.hpp | 2 +- src/compare.cpp | 58 ++++---- src/snakemake/accuracy/Snakefile | 100 ++++++------- src/snakemake/accuracy/plot_accuracy.py | 18 +-- src/snakemake/accuracy/plot_match.py | 24 ++-- .../accuracy/plot_match_representative.py | 77 +++++----- src/snakemake/count/Snakefile | 136 +++++++++--------- src/snakemake/count/plot_counts.py | 24 ++-- .../count/plot_counts_representative.py | 6 +- .../count/plot_counts_representative2.py | 118 +++++++-------- src/snakemake/distance/Snakefile | 70 ++++++--- src/snakemake/genmap/genmap_uniqueness.py | 2 +- src/snakemake/speed/Snakefile | 42 +++--- src/snakemake/speed/plot_speed.py | 83 +++++------ .../speed/plot_speed_representative.py | 8 +- 15 files changed, 402 insertions(+), 366 deletions(-) diff --git a/include/modmer.hpp b/include/modmer.hpp index 2bfa743..662a3f4 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -448,7 +448,7 @@ class modmer_view::basic_iterator } else { - if (fnv_hash(*urng1_iterator, seed) % mod == 0) + if (*urng1_iterator % mod == 0) { modmer_value = *urng1_iterator; return true; diff --git a/src/compare.cpp b/src/compare.cpp index 14d896a..450553a 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -502,9 +502,6 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ if (seq1_vector[i] == seq2_vector[i]) { matches++; - if (current_island > 0) - new_island = true; - switch(args.name) { case kmer: 
fill_positions(positions, i, args.shape.size()); @@ -515,21 +512,24 @@ void match(std::filesystem::path sequence_file1, std::filesystem::path sequence_ else { missed++; + } + } - if (new_island) - { - islands.push_back(current_island); - new_island = false; - current_island = 1; - } - else - { - current_island++; - } + for (int i = 0; i < positions.size(); ++i) + { + if (!positions[i]) + { + current_island++; + } + else if (i > 0) + { + islands.push_back(current_island); + current_island = 0; } } islands.push_back(current_island); + double mean_island, stdev_island; get_mean_and_var(islands, mean_island, stdev_island); @@ -573,38 +573,32 @@ void match_vectors(std::vector & seq1_vector, if ((seq1_vector[it_1] == seq2_vector[it_2]) & changed) { matches++; - changed = false; - if (current_island > 0) - new_island = true; - fill_positions(positions, i, args.k_size); } - else if ((seq1_vector[it_1] == all1_vector[i]) & (seq2_vector[it_2] == all2_vector[i]) & changed) - { - if (new_island) - { - islands.push_back(current_island); - new_island = false; - current_island = 1; - } - else - { - current_island++; - } - } if (seq1_vector[it_1] == all1_vector[i]) { it_1++; - changed = true; } if (seq2_vector[it_2] == all2_vector[i]) { it_2++; - changed = true; } i++; } + + for (int i = 0; i < positions.size(); ++i) + { + if (!positions[i]) + { + current_island++; + } + else if (i > 0) + { + islands.push_back(current_island); + current_island = 0; + } + } islands.push_back(current_island); double mean_island, stdev_island; diff --git a/src/snakemake/accuracy/Snakefile b/src/snakemake/accuracy/Snakefile index ae398aa..cbad62a 100644 --- a/src/snakemake/accuracy/Snakefile +++ b/src/snakemake/accuracy/Snakefile @@ -13,64 +13,64 @@ rule all: ["0_minimiser_hash_"+str(k)+"_"+str(k)+"_match_"+str(error)+".out" for k in range(16,34,2) for error in [1,2,5,10]], [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", 
"66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+ "_"+str([k for k in range(16,34,2)][i]+4)+ "_match_"+str(error)+".out" for i in range(9) for error in [1,2,5,10]], [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in range(16,34,2)][i]+8)+"_match_"+str(error)+".out" for i in range(9) for error in [1,2,5,10]], - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in 
range(8,17) for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in range(8,17) for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in [6,8,10] for error in [1,2,5,10]], # Representative ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4) for error in [1,2,5,10]], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], ["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1] 
for error in [1,2,5,10]], + ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for w in [15,11,7,3,1] for error in [1,2,5,10]], # Representative based on strobemers - ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - 
["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], - 
["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + 
["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in 
[10] for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1] for error in [1,2,5,10]], # Accuracy - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9]], - [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - 
[str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [20] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k 
in [28] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + 
[str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in [10] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], # Representative - ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,3,4,5] for threshold in 
[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]], - [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_6"+"_all_accuracy.out" for w in [15] for error in [2,3,4,5] for threshold in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7]] + ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_"+str(w)+"_all_accuracy.out" for w in [24] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_"+str(w)+"_all_accuracy.out" for w in [3] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_0"+"_all_accuracy.out" for w in [18] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]], + [str(error)+"_"+str(threshold)+"_syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_all_accuracy.out" for w in [15] for error in [2,3,4,5] for threshold in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6]] rule download_example_Data: output: diff --git a/src/snakemake/accuracy/plot_accuracy.py b/src/snakemake/accuracy/plot_accuracy.py index 9d1e3ae..64e8377 100644 --- a/src/snakemake/accuracy/plot_accuracy.py +++ b/src/snakemake/accuracy/plot_accuracy.py @@ -5,6 +5,8 @@ import numpy as np thresholds = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7] +thresholds = [round(i,2) for i in np.linspace(0.01, 0.6, num=60)] +print(thresholds) pos = [x+0.25 for x in range(len(thresholds))] strobe_range = [10] @@ -32,12 +34,12 @@ def read_file(results, files): read_file(results, ["16252901_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [24] for threshold in thresholds]) read_file(results, ["180082591_"+str(error)+"_"+str(threshold)+"_minimiser_hash_"+str(k)+"_"+str(k)+"_all_accuracy.out" for k in [28] for threshold in thresholds]) - read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) - read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) - 
read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) - read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) - read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) - read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results,[str(error)+"_"+str(threshold)+"_hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_all_accuracy.out" for k in strobe_range for threshold in thresholds]) print("Error: ",error, "\n", results) @@ -45,9 +47,9 @@ def read_file(results, files): for error in [2,3,4,5]: results = [] read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_minimiser_hash_20_24_all_accuracy.out" for k in [20] for 
threshold in thresholds]) - #read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_3_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, ["0_"+str(error)+"_"+str(threshold)+"_modmer_hash_20_3_all_accuracy.out" for k in [20] for threshold in thresholds]) read_file(results, [str(error)+"_"+str(threshold)+"_syncmer_hash_20_18_0_0"+"_all_accuracy.out" for k in [20] for threshold in thresholds]) - read_file(results, [str(error)+"_"+str(threshold)+"_syncmer_hash_20_15_0_6"+"_all_accuracy.out" for k in [20] for threshold in thresholds]) + read_file(results, [str(error)+"_"+str(threshold)+"_syncmer_hash_20_15_0_5"+"_all_accuracy.out" for k in [20] for threshold in thresholds]) print("Error: ",error, "\n", results) def fix(): diff --git a/src/snakemake/accuracy/plot_match.py b/src/snakemake/accuracy/plot_match.py index bc2b881..18cb5ea 100644 --- a/src/snakemake/accuracy/plot_match.py +++ b/src/snakemake/accuracy/plot_match.py @@ -35,12 +35,12 @@ def read_file(results, files): shapes8 = ['14021527', '180082591', '3522001919', '64423783901', '1094946651927'] gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size[i]+8)+"_"+str(k_size[i]+8)+"_match_"+str(error)+".out" for i in range(len(k_size))]) - minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in strobe_range]) - hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) - randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in strobe_range]) - minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) - hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) - randstrobemers28 = 
read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in strobe_range]) + minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) + hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) + randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in strobe_range]) + minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in strobe_range]) + hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in strobe_range]) + randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in strobe_range]) # Plot comparison between all Island size @@ -114,12 +114,12 @@ def read_file(results, files): shapes8 = ["45607667", "3522001919", "205814423455"] gapped8_kmers = read_file([], [shapes8[i] + "_minimiser_hash_"+str(k_size_order3[i]+8)+"_"+str(k_size_order3[i]+8)+"_match_"+str(error)+".out" for i in range(len(k_size_order3))]) - minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in k_order3]) - hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in k_order3]) - randstrobemers3 = read_file([], ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_match_"+str(error)+".out" for k in k_order3]) - minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) - hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) - randstrobemers38 = read_file([], 
["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_match_"+str(error)+".out" for k in k_order3]) + minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in k_order3]) + hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in k_order3]) + randstrobemers3 = read_file([], ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_match_"+str(error)+".out" for k in k_order3]) + minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in k_order3]) + hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in k_order3]) + randstrobemers38 = read_file([], ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_match_"+str(error)+".out" for k in k_order3]) fig = plt.figure() X = np.arange(len(k_size_order3)) diff --git a/src/snakemake/accuracy/plot_match_representative.py b/src/snakemake/accuracy/plot_match_representative.py index c0d81d1..28f3e0c 100644 --- a/src/snakemake/accuracy/plot_match_representative.py +++ b/src/snakemake/accuracy/plot_match_representative.py @@ -14,17 +14,23 @@ k_size_order3 = [i*2 for i in k_order3] strobe_range = [k for k in range(8,17,2)] -def read_file(results, files): +def read_file(results, files,normalization = False): cov = 0.0 for file in files: with open(file, 'r') as f: for line in f: + if (line[:7] == "Matches"): + matches = int(line.split(' ')[1].split('\t')[0]) + missed = int(line.split('\t')[1].split(' ')[1].strip()) if (line[:7]=="Match C"): cov = round(float(line.split()[2]),2) if (line[:7]=="Islands"): mean = round(float(line.split('\t')[1]),2) stdev = round(float(line.split('\t')[2]),2) - results.append((mean,stdev,cov)) + if (normalization): + results.append((mean,stdev,cov/(matches))) + else: + results.append((mean,stdev,cov)) return results def plot_match(minimiser, modmer, 
opensyncmer, closedsyncmer, outfile1, outfile2): @@ -40,9 +46,9 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) - plt.legend(bbox_to_anchor=(1.01, 0.75), title="Methods") + plt.legend(loc="upper left", title="Methods") plt.savefig(outfile1,bbox_inches='tight') # Plot comparison between all match coverage @@ -50,15 +56,15 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 X = np.arange(len(k_size)) plt.xlabel("k") plt.xticks(pos, k_size) - plt.ylabel("Match coverage") + plt.ylabel("Match coverage") # /# of matches plt.plot(pos, [x[2] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) plt.plot(pos, [x[2] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) plt.plot(pos, [x[2] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, [x[2] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[2] for x in closedsyncmer], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) - plt.legend(loc="upper right", title="Methods") + plt.legend(loc="lower left", title="Methods") plt.savefig(outfile2,bbox_inches='tight') @@ -67,42 +73,49 @@ def plot_match(minimiser, modmer, opensyncmer, closedsyncmer, outfile1, outfile2 minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4)]) 
modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11]]) opensyncmer = read_file([],["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_match_"+str(error)+".out" for w in [15,11,7,3,1]]) + closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative"+str(error)+".png","../results/Match_cov_representative"+str(error)+".png") - minimiser = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["0_minimiser_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in range(24,44,4)], True) + modmer = read_file([], ["0_modmer_hash_20_"+str(w)+"_match_"+str(error)+".out" for w in [3,5,7,9,11]], True) + opensyncmer = read_file([],["syncmer_hash_20_"+str(w)+"_0_0_match_"+str(error)+".out" for w in [18,16,14,12,10]], True) + closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for w in [15,11,7,3,1]], True) + print(minimiser) + plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative"+str(error)+".png","../results/Match_cov_representative_corrected"+str(error)+".png") 
+ + minimiser = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["hybridstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_hybrid1_"+str(error)+".png","../results/Match_cov_representative_hybrid1_"+str(error)+".png") - minimiser = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["minstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" 
for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_min1_"+str(error)+".png","../results/Match_cov_representative_min1_"+str(error)+".png") - minimiser = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["randstrobemers_2_" +str(1)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["randstrobemers_2_" +str(1)+"_"+str(4+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["randstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["randstrobemers_2_" +str(1)+"_"+str(4+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_rand1_"+str(error)+".png","../results/Match_cov_representative_rand1_"+str(error)+".png") - minimiser = read_file([], 
["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["hybridstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["hybridstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_hybrid_"+str(error)+".png","../results/Match_cov_representative_hybrid_"+str(error)+".png") - minimiser = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in 
[18,16,14,12,10]]) - closedsyncmer = read_file([], ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["minstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["minstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_min_"+str(error)+".png","../results/Match_cov_representative_min_"+str(error)+".png") - minimiser = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) - modmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) - opensyncmer = read_file([],["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) - closedsyncmer = read_file([], ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_syncmer_hash_10_"+str(w)+"_0_6_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) + minimiser = read_file([], ["randstrobemers_2_" +str(1)+"_"+str(8+k)+"_minimiser_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in range(24,44,4)]) + modmer = read_file([], ["randstrobemers_2_" 
+str(1)+"_"+str(8+k)+"_modmer_hash_10_"+str(w)+"_match_"+str(error)+".out" for k in [10] for w in [3,5,7,9,11]]) + opensyncmer = read_file([],["randstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_0_match_"+str(error)+".out" for k in [10] for w in [18,16,14,12,10]]) + closedsyncmer = read_file([], ["randstrobemers_2_" +str(1)+"_"+str(8+k)+"_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_match_"+str(error)+".out" for k in [10] for w in [15,11,7,3,1]]) plot_match(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Match_island_representative_rand_"+str(error)+".png","../results/Match_cov_representative_rand_"+str(error)+".png") diff --git a/src/snakemake/count/Snakefile b/src/snakemake/count/Snakefile index 5cd00db..9039372 100644 --- a/src/snakemake/count/Snakefile +++ b/src/snakemake/count/Snakefile @@ -38,19 +38,19 @@ rule plot: [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+"_"+str([k for k in range(16,34,2)][i]+4)+"_counts.out" for i in range(9)], [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in range(16,34,2)][i]+8)+"_counts.out" for i in range(9)], # 4 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_counts.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_counts.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in [6,8,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" 
for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in [6,8,10]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in [6,8,10]] + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in [6,8,10]] output: "../results/Count_all.png", "../results/Count_all8.png", @@ -69,8 +69,8 @@ rule plot_representative: # syncmer ["syncmer_hash_20_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]], ["syncmer_hash_"+str(k)+"_10_0_0_counts.out" for k in [i for i in range(22,12,-2)]], - ["syncmer_hash_20_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]], - ["syncmer_hash_"+str(k)+"_3_0_6_counts.out" for k in [i for i in range(28,8,-4)]], + 
["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]], + ["syncmer_hash_"+str(k)+"_3_0_5_counts.out" for k in [i for i in range(28,8,-4)]], shell: "python3 plot_counts_representative.py" rule plot_representative2: @@ -89,29 +89,29 @@ rule plot_representative2: "../results/Count_representative_rand3.png" input: # minimiser based on strobemers - ["Rep2_"+smethod+"_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], - ["Rep2_"+smethod+"_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], - ["Rep2_"+smethod+"_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_1_18_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], # modmer based on strobemers - ["Rep2_"+smethod+"_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" 
for w in [3,5,7,9,11] for smethod in ["hybrid"]], - ["Rep2_"+smethod+"_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], - ["Rep2_"+smethod+"_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_1_18_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], # syncmer based on strobemers - [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], - [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], - [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + 
[smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], + [smethod+"_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], # syncmer with two positions based on strobemers - [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], - [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], - [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], + [smethod+"_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]], + 
[smethod+"_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] shell: "python3 plot_counts_representative2.py" @@ -174,19 +174,19 @@ rule unique_kmer: [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+4)+"_"+str([k for k in range(16,34,2)][i]+4)+"_GRCh38.p13.genome.fa_counts.out" for i in range(9)], [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_minimiser_hash_"+str([k for k in range(16,34,2)][i]+8)+"_"+str([k for k in range(16,34,2)][i]+8)+"_GRCh38.p13.genome.fa_counts.out" for i in range(9)], # 4 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + 
["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]] + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_GRCh38.p13.genome.fa_counts.out" for k in [6,8,10]] output: "../results/Unique.out" shell: @@ -238,8 +238,8 @@ rule unique_representative: # syncmer ["syncmer_hash_20_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10]], ["syncmer_hash_"+str(k)+"_10_0_0_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(22,12,-2)]], - ["syncmer_hash_20_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1]], - 
["syncmer_hash_"+str(k)+"_3_0_6_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(28,8,-4)]] + ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1]], + ["syncmer_hash_"+str(k)+"_3_0_5_GRCh38.p13.genome.fa_counts.out" for k in [i for i in range(28,8,-4)]] output: "../results/Unique_representative.out" shell: @@ -249,29 +249,29 @@ rule unique_representative: rule unique_representative2: input: # minimiser based on strobemers - ["Rep2_"+smethod+"_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], - ["Rep2_"+smethod+"_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], - ["Rep2_"+smethod+"_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_1_18_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(24,44,4)] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_13_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] 
for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_17_Strobemer_minimiser_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [i for i in range(29,44,4)] for smethod in ["hybrid","min","rand"]], # modmer based on strobemers - ["Rep2_"+smethod+"_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid"]], - ["Rep2_"+smethod+"_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], - ["Rep2_"+smethod+"_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], - ["Rep2_"+smethod+"_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid"]], + ["Rep2_"+smethod+"_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["min","rand"]], + ["Rep2_"+smethod+"_2_1_18_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [3,5,7,9,11] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_13_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], + ["Rep2_"+smethod+"_3_1_17_Strobemer_modmer_hash_9_"+str(w)+"_GRCh38.p13.genome.fa_counts.out" for w in [2,4,6,8] for smethod in ["hybrid","min","rand"]], # syncmer based on strobemers - [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], - 
[smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], - [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid"]], + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["min","rand"]], + [smethod+"_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [18,16,14,12,10] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_GRCh38.p13.genome.fa_counts.out" for w in [26,24,22,20] for smethod in ["hybrid","min","rand"]], # syncmer with two positions based on strobemers - [smethod+"_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], - [smethod+"_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], - [smethod+"_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w 
in [24,20,16,12] for smethod in ["hybrid","min","rand"]], - [smethod+"_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_GRCh38.p13.genome.fa_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid"]], + [smethod+"_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["min","rand"]], + [smethod+"_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [15,11,7,3,1] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]], + [smethod+"_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_GRCh38.p13.genome.fa_counts.out" for w in [24,20,16,12] for smethod in ["hybrid","min","rand"]] output: "../results/Unique_representative2.out" shell: diff --git a/src/snakemake/count/plot_counts.py b/src/snakemake/count/plot_counts.py index a0b07f1..fc520a9 100644 --- a/src/snakemake/count/plot_counts.py +++ b/src/snakemake/count/plot_counts.py @@ -37,18 +37,18 @@ def read_file(results, files): shapes8_order3 = ['45607667', '3522001919', '205814423455'] gapped8_order3 = read_file([], [shapes8_order3[i] + "_minimiser_hash_"+str(k_size_order3[i]+8)+"_"+str(k_size_order3[i]+8)+"_counts.out" for i in range(len(k_order3))]) -minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in strobe_range]) -minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in k_order3]) -hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_counts.out" for k in strobe_range]) -hybridstrobemers3 = 
read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_counts.out" for k in k_order3]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_counts.out" for k in strobe_range]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_counts.out" for k in k_order3]) -minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) -minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) -hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) -hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_counts.out" for k in strobe_range]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_counts.out" for k in k_order3]) +minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" for k in strobe_range]) +minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in k_order3]) +hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" for k in strobe_range]) +hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in k_order3]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_counts.out" for k in strobe_range]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_counts.out" for k in k_order3]) +minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in strobe_range]) +minstrobemers38 = read_file([], 
["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in k_order3]) +hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in strobe_range]) +hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in k_order3]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_counts.out" for k in strobe_range]) +randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_counts.out" for k in k_order3]) # Plot comparison between all fig = plt.figure() diff --git a/src/snakemake/count/plot_counts_representative.py b/src/snakemake/count/plot_counts_representative.py index 3f78796..bc599e4 100644 --- a/src/snakemake/count/plot_counts_representative.py +++ b/src/snakemake/count/plot_counts_representative.py @@ -21,8 +21,8 @@ def read_file(results, files): # syncmer opensyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) opensyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_10_0_0_counts.out" for k in [i for i in range(22,12,-2)]]) -closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) -closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_6_counts.out" for k in [i for i in range(28,8,-4)]]) +closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) +closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_5_counts.out" for k in [i for i in range(28,8,-4)]]) # Plot comparison between k-mers @@ -39,7 +39,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer',linewidth=3.0) plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) 
-plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) plt.legend(bbox_to_anchor=(1.01, 0.75),title="Methods") plt.savefig("../results/Count_representative.png", bbox_inches='tight') diff --git a/src/snakemake/count/plot_counts_representative2.py b/src/snakemake/count/plot_counts_representative2.py index 8119190..aa5a77f 100644 --- a/src/snakemake/count/plot_counts_representative2.py +++ b/src/snakemake/count/plot_counts_representative2.py @@ -31,87 +31,87 @@ def create_plot(minimiser, modmer, opensyncmer, closedsyncmer, outfile,number_el plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer') plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) else: plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,27)-minimizer',linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(27,m)-modmer',linewidth=3.0) plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(27,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(27,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(27,s,[0,27-s],1)-syncmer',linewidth=3.0) plt.legend(title="Methods") plt.savefig(outfile, bbox_inches='tight') -minimiser = read_file([], ["Rep2_min_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], 
["Rep2_min_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["min_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["min_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], ["Rep2_min_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_min_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["min_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["min_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min1.png") -minimiser = read_file([], ["Rep2_rand_2_0_13_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], ["Rep2_rand_2_0_13_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["rand_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["rand_2_0_13_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], ["Rep2_rand_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_rand_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["rand_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["rand_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, 
closedsyncmer, "../results/Count_representative_rand1.png") -minimiser = read_file([], ["Rep2_hybrid_2_0_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], ["Rep2_hybrid_2_0_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["hybrid_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["hybrid_2_0_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], ["Rep2_hybrid_2_1_14_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_2_1_14_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["hybrid_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["hybrid_2_1_14_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid1.png") -minimiser = read_file([], ["Rep2_min_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], ["Rep2_min_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["min_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["min_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], ["Rep2_min_2_1_18_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_min_2_1_18_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], 
["min_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["min_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min.png") -minimiser = read_file([], ["Rep2_rand_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], ["Rep2_rand_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["rand_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["rand_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], ["Rep2_rand_2_1_18_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_rand_2_1_18_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["rand_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["rand_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand.png") -minimiser = read_file([], ["Rep2_hybrid_2_0_17_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) -modmer = read_file([], ["Rep2_hybrid_2_0_17_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) -opensyncmer = read_file([], ["hybrid_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) -closedsyncmer = read_file([], ["hybrid_2_0_17_Strobemer_syncmer_hash_10_"+str(w)+"_0_6_counts.out" for w in [15,11,7,3,1]]) +minimiser = read_file([], 
["Rep2_hybrid_2_1_18_Strobemer_minimiser_hash_10_"+str(w)+"_counts.out" for w in [i for i in range(24,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_2_1_18_Strobemer_modmer_hash_10_"+str(w)+"_counts.out" for w in [3,5,7,9,11]]) +opensyncmer = read_file([], ["hybrid_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_counts.out" for w in [18,16,14,12,10]]) +closedsyncmer = read_file([], ["hybrid_2_1_18_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_counts.out" for w in [15,11,7,3,1]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid.png") -minimiser = read_file([], ["Rep2_min_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], ["Rep2_min_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["min_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["min_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) -create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min31.png", 4) - -minimiser = read_file([], ["Rep2_rand_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], ["Rep2_rand_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["rand_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["rand_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) -create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand31.png", 4) - -minimiser = read_file([], ["Rep2_hybrid_3_0_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], 
["Rep2_hybrid_3_0_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["hybrid_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["hybrid_3_0_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +minimiser = read_file([], ["Rep2_min_3_1_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_min_3_1_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["min_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["min_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min31.png",4) + +minimiser = read_file([], ["Rep2_rand_3_1_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_rand_3_1_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["rand_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["rand_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) +create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representatie_rand31.png", 4) + +minimiser = read_file([], ["Rep2_hybrid_3_1_13_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_3_1_13_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["hybrid_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], 
["hybrid_3_1_13_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid31.png",4) -minimiser = read_file([], ["Rep2_min_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], ["Rep2_min_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["min_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["min_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +minimiser = read_file([], ["Rep2_min_3_1_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_min_3_1_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["min_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["min_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_min3.png", 4) -minimiser = read_file([], ["Rep2_rand_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], ["Rep2_rand_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["rand_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["rand_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +minimiser = read_file([], ["Rep2_rand_3_1_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], 
["Rep2_rand_3_1_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["rand_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["rand_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_rand3.png", 4) -minimiser = read_file([], ["Rep2_hybrid_3_0_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) -modmer = read_file([], ["Rep2_hybrid_3_0_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) -opensyncmer = read_file([], ["hybrid_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) -closedsyncmer = read_file([], ["hybrid_3_0_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_6_counts.out" for w in [24,20,16,12]]) +minimiser = read_file([], ["Rep2_hybrid_3_1_17_Strobemer_minimiser_hash_9_"+str(w)+"_counts.out" for w in [i for i in range(29,44,4)]]) +modmer = read_file([], ["Rep2_hybrid_3_1_17_Strobemer_modmer_hash_9_"+str(w)+"_counts.out" for w in [2,4,6,8]]) +opensyncmer = read_file([], ["hybrid_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_0_counts.out" for w in [26,24,22,20]]) +closedsyncmer = read_file([], ["hybrid_3_1_17_Strobemer_syncmer_hash_9_"+str(w)+"_0_"+str(27-w)+"_counts.out" for w in [24,20,16,12]]) create_plot(minimiser, modmer, opensyncmer, closedsyncmer, "../results/Count_representative_hybrid3.png",4) @@ -180,7 +180,7 @@ def create_plot(minimiser, modmer, opensyncmer, closedsyncmer, outfile,number_el with open("../results/Unique_representative2.out", 'r') as f: for line in f: number = float(line.split()[1]) - if ("2_0_17_" in line): + if ("2_1_18_" in line): if ("hybrid" in line): if ("minimiser" in line): minimiser_hybrid.append(number) @@ -208,7 +208,7 @@ def create_plot(minimiser, modmer, opensyncmer, closedsyncmer, 
outfile,number_el opensyncmer_min.append(number) else: closedsyncmer_min.append(number) - elif ("3_0_17_" in line): + elif ("3_1_17_" in line): if ("hybrid" in line): if ("minimiser" in line): minimiser_hybrid3.append(number) @@ -236,7 +236,7 @@ def create_plot(minimiser, modmer, opensyncmer, closedsyncmer, outfile,number_el opensyncmer_min3.append(number) else: closedsyncmer_min3.append(number) - elif ("3_0_13_" in line): + elif ("3_1_13_" in line): if ("hybrid" in line): if ("minimiser" in line): minimiser_hybrid31.append(number) @@ -312,12 +312,12 @@ def plot_unique(minimiser, modmer, opensyncmer, closedsyncmer, outfile, num_elem plt.plot(pos, minimiser[:5], color = colors[0], label='(w,20)-minimizer',linewidth=3.0) plt.plot(pos, modmer[:5], color = colors[1], label='(20,m)-modmer',linewidth=3.0) plt.plot(pos, opensyncmer[:5], color = colors[2], label='(20,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, closedsyncmer[:5], color = colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, closedsyncmer[:5], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) else: plt.plot(pos, minimiser[:5], color = colors[0], label='(w,27)-minimizer',linewidth=3.0) plt.plot(pos, modmer[:5], color = colors[1], label='(27,m)-modmer',linewidth=3.0) plt.plot(pos, opensyncmer[:5], color = colors[2], label='(27,s,[0],1)-syncmer',linewidth=3.0) - plt.plot(pos, closedsyncmer[:5], color = colors[3], label='(27,s,[0,6],1)-syncmer',linewidth=3.0) + plt.plot(pos, closedsyncmer[:5], color = colors[3], label='(27,s,[0,27-s],1)-syncmer',linewidth=3.0) plt.legend(title="Methods") plt.savefig(outfile, bbox_inches='tight') diff --git a/src/snakemake/distance/Snakefile b/src/snakemake/distance/Snakefile index 4be23e7..88561d5 100644 --- a/src/snakemake/distance/Snakefile +++ b/src/snakemake/distance/Snakefile @@ -1,17 +1,35 @@ rule all: input: # Representative - ["0_minimiser_hash_20_"+str(w)+"_distance_"+str(error)+".out" for w in range(24,44,4) for error in 
[1,2,5,10]], - ["0_modmer_hash_20_"+str(w)+"_distance_"+str(error)+".out" for w in [3,5,7,9,11] for error in [1,2,5,10]], - ["syncmer_hash_20_"+str(w)+"_0_0_distance_"+str(error)+".out" for w in [18,16,14,12,10] for error in [1,2,5,10]], - ["syncmer_hash_20_"+str(w)+"_0_6_distance_"+str(error)+".out" for w in [15,11,7,3,1] for error in [1,2,5,10]], + ["0_minimiser_hash_20_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for w in range(24,44,4)], + ["0_modmer_hash_20_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for w in [3,5,7,9,11]], + ["syncmer_hash_20_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for w in [18,16,14,12,10]], + ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for w in [15,11,7,3,1]], # Representative based on strobemers - ["hybridstrobemers_2_"+str(0)+"_"+str(4+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(3+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["hybridstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["minstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]], - ["randstrobemers_2_"+str(0)+"_"+str(7+k)+"_minimiser_hash_10_"+str(w)+"_distance_"+str(error)+".out" for k in [10] for w in range(24,44,4) for error in [1,2,5,10]] + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + 
["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_minimiser_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in range(24,44,4)], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_modmer_hash_10_"+str(w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [3,5,7,9,11]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + 
["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_0_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [18,16,14,12,10]], + ["hybridstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]], + ["minstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]], + ["randstrobemers_2_"+str(1)+"_"+str(4+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]], + ["hybridstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]], + ["minstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]], + ["randstrobemers_2_"+str(1)+"_"+str(8+k)+"_Strobemer_syncmer_hash_10_"+str(w)+"_0_"+str(20-w)+"_GRCh38.p13.genome.fa_distances.out" for k in [10] for w in [15,11,7,3,1]] rule download_human_genome: output: @@ -23,26 +41,26 @@ rule distance_minimiser_modmer: input: "../results/GRCh38.p13.genome.fa.gz" output: - 
"{shape}_{method}_hash_{kmer_size}_{w_size}_distance_{error}.out" + "{shape}_{method}_hash_{kmer_size}_{w_size}_GRCh38.p13.genome.fa_distances.out" wildcard_constraints: shape='[0-9]*', method='(modmer|minimiser)' shell: - "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape {wildcards.shape} -o {wildcards.shape}_ {input} > {wildcards.shape}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_distance_{wildcards.error}.out" + "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --shape {wildcards.shape} -o {wildcards.shape}_ {input}" rule distance_syncmer: input: "../results/GRCh38.p13.genome.fa.gz" output: - "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_distance_{error}.out", + "syncmer_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_GRCh38.p13.genome.fa_distances.out", shell: - "minions distance --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input} > syncmer_hash_{wildcards.kmer_size}_{wildcards.w_size}_{wildcards.pos_begin}_{wildcards.pos_end}_distance_{wildcards.error}.out" + "minions distance --method syncmer -k {wildcards.kmer_size} -w {wildcards.w_size} -p {wildcards.pos_begin} -p {wildcards.pos_end} --shape 0 {input}" rule distance_strobemer: input: "../results/GRCh38.p13.genome.fa.gz" output: - "{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_distance_{error}.out" + "{method}strobemers_{kmer_size}_{order}_{wmin}_{wmax}_GRCh38.p13.genome.fa_distances.out" wildcard_constraints: method='(min|rand|hybrid)', kmer_size='[0-9]*', @@ -50,13 +68,13 @@ rule distance_strobemer: wmin='[0-9]*', wmax='[0-9]*' shell: - "minions distance --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input} > 
{wildcards.method}strobemers_{wildcards.kmer_size}_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_distance_{wildcards.error}.out" + "minions distance --method strobemer --{wildcards.method} -k {wildcards.kmer_size} --w-min {wildcards.wmin} --w-max {wildcards.wmax} --order {wildcards.order} {input}" rule distance_minimiser_modmer_strobemer: input: "../results/GRCh38.p13.genome.fa.gz" output: - "{method2}strobemers_{order}_{wmin}_{wmax}_{method}_hash_{kmer_size}_{w_size}_distance_{error}.out" + "{method2}strobemers_{order}_{wmin}_{wmax}_Strobemer_{method}_hash_{kmer_size}_{w_size}_GRCh38.p13.genome.fa_distances.out" wildcard_constraints: method='(modmer|minimiser)', method2='(min|rand|hybrid)', @@ -66,4 +84,20 @@ rule distance_minimiser_modmer_strobemer: wmin='[0-9]*', wmax='[0-9]*' shell: - "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} {input} > {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_{wildcards.method}_hash_{wildcards.kmer_size}_{wildcards.w_size}_distance_{wildcards.error}.out" + "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w {wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} -o {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_ {input}" + +rule distance_syncmer_strobemer: + input: + "../results/GRCh38.p13.genome.fa.gz" + output: + "{method2}strobemers_{order}_{wmin}_{wmax}_Strobemer_{method}_hash_{kmer_size}_{w_size}_{pos_begin}_{pos_end}_GRCh38.p13.genome.fa_distances.out" + wildcard_constraints: + method='(syncmer)', + method2='(min|rand|hybrid)', + order='(2|3)', + kmer_size='[0-9]*', + w_size='[0-9]*', + wmin='[0-9]*', + wmax='[0-9]*' + shell: + "minions distance --method {wildcards.method} -k {wildcards.kmer_size} -w 
{wildcards.w_size} --strobemer --w-min {wildcards.wmin} --w-max {wildcards.wmax} --{wildcards.method2} --order {wildcards.order} -p {wildcards.pos_begin} -p {wildcards.pos_end} -o {wildcards.method2}strobemers_{wildcards.order}_{wildcards.wmin}_{wildcards.wmax}_ {input}" diff --git a/src/snakemake/genmap/genmap_uniqueness.py b/src/snakemake/genmap/genmap_uniqueness.py index 767dfe1..1bbc553 100644 --- a/src/snakemake/genmap/genmap_uniqueness.py +++ b/src/snakemake/genmap/genmap_uniqueness.py @@ -25,7 +25,7 @@ def get_results(species): results[errors.index(e)].append(get_unique(genmap_file)) fig = plt.figure() X = np.arange(len(k_mers)) - colors = ["#00ba32","#00d6e7","#fad100"] + colors = ["#00ba32","#00d6e7","#fad100"] # ["#1b9e77","#d95f02","#7570b3"] #["#00ba32","#00d6e7","#fad100"] pos = [0.25,1.25,2.25,3.25,4.25] plt.xlabel("k") plt.xticks(pos, k_mers) diff --git a/src/snakemake/speed/Snakefile b/src/snakemake/speed/Snakefile index 61ce6cb..b733b61 100644 --- a/src/snakemake/speed/Snakefile +++ b/src/snakemake/speed/Snakefile @@ -35,29 +35,29 @@ rule plot: [["777695", "2621175", "16252901", "50196477", "251620351", "905838335", "4286578095", "13958643693", "66035113981"][i]+"_kmer_hash_"+str([k for k in range(16,34,2)][i]+4)+"_speed.out" for i in range(9)], [["14021527", "45607667", "180082591", "1068161519", "3522001919", "13957854679", "64423783901", "205814423455", "1094946651927"][i]+"_kmer_hash_"+str([k for k in range(16,34,2)][i]+8)+"_speed.out" for i in range(9)], # 4 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in range(8,17)], - 
["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in [6,8,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], # 8 "gaps" - ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], - ["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], - ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in range(8,17)], - ["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in [6,8,10]], + ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in [6,8,10]], + ["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in [6,8,10]], + ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], + ["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in [6,8,10]], # 4 "gaps" - ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], + ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in 
range(8,17)], - ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in range(8,17)], - ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in [6,8,10]], + ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in [6,8,10]], # 8 "gaps" - ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], - ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], - ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in range(8,17)], - ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in [6,8,10]] + ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], + ["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in range(8,17)], + ["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in [6,8,10]] shell: "python3 plot_speed.py" rule plot_representative: @@ -73,8 +73,8 @@ rule plot_representative: # syncmer ["syncmer_hash_20_"+str(w)+"_0_0_speed.out" for w in [18,16,14,12,10]], ["syncmer_hash_"+str(k)+"_10_0_0_speed.out" for k in [i for i in range(22,12,-2)]], - ["syncmer_hash_20_"+str(w)+"_0_6_speed.out" for w in [15,11,7,3,1]], - ["syncmer_hash_"+str(k)+"_3_0_6_speed.out" for k in [i for i in range(28,8,-4)]] + ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_speed.out" for w in [15,11,7,3,1]] + #["syncmer_hash_"+str(k)+"_3_0_6_speed.out" for k in [i for i in range(28,8,-4)]] shell: "python3 plot_speed_representative.py" rule download_example_Data: diff --git a/src/snakemake/speed/plot_speed.py b/src/snakemake/speed/plot_speed.py index 75a2209..4454d64 100644 
--- a/src/snakemake/speed/plot_speed.py +++ b/src/snakemake/speed/plot_speed.py @@ -40,20 +40,20 @@ def read_file(results, files): shapes8_order3 = ['45607667', '3522001919', '205814423455'] gapped8_order3 = read_file([], [shapes8_order3[i] + "_kmer_hash_"+str(k_size_order3[i])+"_speed.out" for i in range(len(k_order3))]) -minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in strobe_range]) -minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in k_order3]) -hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(4+k)+"_speed.out" for k in k_order3]) -randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(3+k)+"_speed.out" for k in strobe_range]) -randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(3+k)+"_speed.out" for k in k_order3]) -minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) -minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) -hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) -hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) -randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(0)+"_"+str(7+k)+"_speed.out" for k in strobe_range]) -randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(0)+"_"+str(7+k)+"_speed.out" for k in k_order3]) -original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -original_randstrobemers38 = 
read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +minstrobemers2 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +minstrobemers3 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) +hybridstrobemers2 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers3 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) +randstrobemers2 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +randstrobemers3 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) +minstrobemers28 = read_file([], ["minstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +minstrobemers38 = read_file([], ["minstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +hybridstrobemers28 = read_file([],["hybridstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +hybridstrobemers38 = read_file([],["hybridstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +randstrobemers28 = read_file([], ["randstrobemers_"+str(k)+"_2_"+str(1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) +randstrobemers38 = read_file([],["randstrobemers_"+str(k)+"_3_"+str(1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in k_order3]) # Plot comparison between k-mers fig = plt.figure() @@ -160,8 +160,6 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size)) -colors = 
["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] -colors = ["#004c6d","#009dbe","#00f6ff","#fee6ce","#fdae6b","#e6550d"] colors = ["#004c6d","#009dbe","#00f6ff","#fdcc8a","#fc8d59","#d7301f"] plt.xlabel("k") plt.xticks(pos, k_size) @@ -183,7 +181,6 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size)) -#colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds @@ -204,7 +201,6 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size_order3)) -#colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos_order3, k_size_order3) plt.ylabel("Speed in microseconds") # in microseconds @@ -225,7 +221,6 @@ def read_file(results, files): fig = plt.figure() X = np.arange(len(k_size_order3)) -#colors = ["#00ba32","#00d6e7","#fad100","#697ed5","#c76674","#9350a1"] plt.xlabel("k") plt.xticks(pos_order3, k_size_order3) plt.ylabel("Speed in microseconds") # in microseconds @@ -283,18 +278,18 @@ def read_file(results, files): # Plot comparison between strobemers all gaps -original_minstrobemers2 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) +original_minstrobemers2 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) original_hybridstrobemers2 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) -original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in strobe_range]) -original_randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(4+k)+"_speed.out" for k in k_order3]) -original_minstrobemers28 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" 
for k in strobe_range]) -original_hybridstrobemers28 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -original_randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in strobe_range]) -original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(8+k)+"_speed.out" for k in k_order3]) +original_randstrobemers2 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers3 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(5+k)+"_speed.out" for k in k_order3]) +original_minstrobemers28 = read_file([], ["Original_minstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) +original_hybridstrobemers28 = read_file([],["Original_hybridstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers28 = read_file([], ["Original_randstrobemers_"+str(k)+"_2_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in strobe_range]) +original_randstrobemers38 = read_file([],["Original_randstrobemers_"+str(k)+"_3_"+str(k+1)+"_"+str(9+k)+"_speed.out" for k in k_order3]) fig = plt.figure() X = np.arange(len(k_size)) -#colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] +colors = ["#697ed5","#c76674","#9350a1","#00ba32","#00d6e7","#fad100"] colors_error = ["#748beb","#e47585","#b261c2","#01d63a","#00e7e0","#fefea1"] plt.xlabel("k") plt.xticks(pos, k_size) @@ -323,13 +318,11 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds colors_ori = ["#bae4bc","#7bccc4","#43a2ca","#0868ac"] -colors_ori = ["#e66101","#fdb863","#b2abd2","#5e3c99"] -colors_ori = ["#dfc27d","#a6611a","#80cdc1","#018571"] -plt.plot(pos, [x[0] for x in minstrobemers2], color = colors_ori[0], label='4', linewidth=3.0) 
-plt.plot(pos, [x[0] for x in minstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in minstrobemers2], color = colors_ori[0], label='4') +plt.plot(pos, [x[0] for x in minstrobemers28], color = colors_ori[1], label='8') +plt.plot(pos, [x[0] for x in original_minstrobemers2], color = colors_ori[2], label='4 ori') +plt.plot(pos, [x[0] for x in original_minstrobemers28], color = colors_ori[3], label='8 ori') #plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers2], [x[0]+x[1] for x in minstrobemers2], color = colors_error[2], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in minstrobemers28], [x[0]+x[1] for x in minstrobemers28], color = colors_error[5], alpha=0.7) @@ -344,10 +337,10 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors_ori[0], label='4',linewidth=3.0) -plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in hybridstrobemers2], color = colors_ori[0], label='4') +plt.plot(pos, [x[0] for x in hybridstrobemers28], color = colors_ori[1], label='8') +plt.plot(pos, [x[0] for x in original_hybridstrobemers2], color = colors_ori[2], label='4 ori') +plt.plot(pos, [x[0] for x in original_hybridstrobemers28], color = colors_ori[3], label='8 ori') #plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers2], [x[0]+x[1] for x in hybridstrobemers2], color = colors_error[2], alpha=0.7) 
#plt.fill_between(pos, [x[0]-x[1] for x in hybridstrobemers28], [x[0]+x[1] for x in hybridstrobemers28], color = colors_error[5], alpha=0.7) @@ -362,10 +355,10 @@ def read_file(results, files): plt.xticks(pos, k_size) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos, [x[0] for x in randstrobemers2], color = colors_ori[0], label='4',linewidth=3.0) -plt.plot(pos, [x[0] for x in randstrobemers28], color = colors_ori[1], label='8',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors_ori[2], label='4 ori',linewidth=3.0) -plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors_ori[3], label='8 ori',linewidth=3.0) +plt.plot(pos, [x[0] for x in randstrobemers2], color = colors_ori[0], label='4') +plt.plot(pos, [x[0] for x in randstrobemers28], color = colors_ori[1], label='8') +plt.plot(pos, [x[0] for x in original_randstrobemers2], color = colors_ori[2], label='4 ori') +plt.plot(pos, [x[0] for x in original_randstrobemers28], color = colors_ori[3], label='8 ori') #plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers2], [x[0]+x[1] for x in randstrobemers2], color = colors_error[2], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in randstrobemers28], [x[0]+x[1] for x in randstrobemers28], color = colors_error[5], alpha=0.7) @@ -380,10 +373,10 @@ def read_file(results, files): plt.xticks(pos_order3, k_order3) plt.ylabel("Speed in microseconds") # in microseconds -plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors_ori[0], label='4',linewidth=3.0) -plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors_ori[1], label='8',linewidth=3.0) -plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors_ori[2], label='4 ori',linewidth=3.0) -plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors_ori[3], label='8 ori',linewidth=3.0) +plt.plot(pos_order3, [x[0] for x in randstrobemers3], color = colors_ori[0], label='4') 
+plt.plot(pos_order3, [x[0] for x in randstrobemers38], color = colors_ori[1], label='8') +plt.plot(pos_order3, [x[0] for x in original_randstrobemers3], color = colors_ori[2], label='4 ori') +plt.plot(pos_order3, [x[0] for x in original_randstrobemers38], color = colors_ori[3], label='8 ori') #plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers3], [x[0]+x[1] for x in randstrobemers3], color = colors_error[2], alpha=0.7) #plt.fill_between(pos_order3, [x[0]-x[1] for x in randstrobemers38], [x[0]+x[1] for x in randstrobemers38], color = colors_error[5], alpha=0.7) diff --git a/src/snakemake/speed/plot_speed_representative.py b/src/snakemake/speed/plot_speed_representative.py index 8937340..c9b3eda 100644 --- a/src/snakemake/speed/plot_speed_representative.py +++ b/src/snakemake/speed/plot_speed_representative.py @@ -21,8 +21,8 @@ def read_file(results, files): # syncmer opensyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_0_speed.out" for w in [18,16,14,12,10]]) opensyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_10_0_0_speed.out" for k in [i for i in range(22,12,-2)]]) -closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_6_speed.out" for w in [15,11,7,3,1]]) -closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_6_speed.out" for k in [i for i in range(28,8,-4)]]) +closedsyncmer = read_file([], ["syncmer_hash_20_"+str(w)+"_0_"+str(20-w)+"_speed.out" for w in [15,11,7,3,1]]) +#closedsyncmer_setw = read_file([], ["syncmer_hash_"+str(k)+"_3_0_6_speed.out" for k in [i for i in range(28,8,-4)]]) # Plot comparison between k-mers @@ -40,7 +40,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in minimiser], color = colors[0], label='(w,20)-minimizer', linewidth=3.0) plt.plot(pos, [x[0] for x in modmer], color = colors[1], label='(20,m)-modmer', linewidth=3.0) plt.plot(pos, [x[0] for x in opensyncmer], color = colors[2], label='(20,s,[0],1)-syncmer', linewidth=3.0) -plt.plot(pos, [x[0] for x in closedsyncmer], color = 
colors[3], label='(20,s,[0,6],1)-syncmer',linewidth=3.0) +plt.plot(pos, [x[0] for x in closedsyncmer], color = colors[3], label='(20,s,[0,20-s],1)-syncmer',linewidth=3.0) #plt.fill_between(pos, [x[0]-x[1] for x in minimiser], [x[0]+x[1] for x in minimiser], color = colors_error[0], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in modmer], [x[0]+x[1] for x in modmer], color = colors_error[1], alpha=0.7) @@ -66,7 +66,7 @@ def read_file(results, files): plt.plot(pos, [x[0] for x in minimiser_setw], color = colors[0], label='(40,k)-minimizer',linewidth=3.0) plt.plot(pos, [x[0] for x in modmer_setw], color = colors[1], label='(k,7)-modmer',linewidth=3.0) plt.plot(pos, [x[0] for x in opensyncmer_setw], color = colors[2], label='(k,10,[0],1)-syncmer',linewidth=3.0) -plt.plot(pos, [x[0] for x in closedsyncmer_setw], color = colors[3], label='(k,3,[0,6],1)-syncmer',linewidth=3.0) +#plt.plot(pos, [x[0] for x in closedsyncmer_setw], color = colors[3], label='(k,3,[0,6],1)-syncmer',linewidth=3.0) #plt.fill_between(pos, [x[0]-x[1] for x in minimiser_setw], [x[0]+x[1] for x in minimiser_setw], color = colors_error[0], alpha=0.7) #plt.fill_between(pos, [x[0]-x[1] for x in modmer_setw], [x[0]+x[1] for x in modmer_setw], color = colors_error[1], alpha=0.7) From 978d339ac75fdcc69511e830e1eeb21bbefc7faa Mon Sep 17 00:00:00 2001 From: MitraDarja Date: Thu, 17 Aug 2023 15:02:47 +0200 Subject: [PATCH 31/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1378a25..e82110e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Currently, the following methods are supported: - k-mers - minimizers - modmers -- strobemers (integrated as submodule from [here](https://github.com/ksahlin/strobemers) and implemented as a view) +- strobemers (integrated as submodule from [here](https://github.com/ksahlin/strobemers) and implemented as a view for hybrid-, min- and randstrobemers) - syncmers See Issue #1 for a list 
of methods that will be added in the future (see down below here for an example usage of each method). From 8767a775b6abd1af6d9a1ad9dbd33770c4e262fb Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 24 Aug 2023 13:47:18 +0200 Subject: [PATCH 32/34] [FIX] Static assert. --- include/minions_minimiser.hpp | 589 +++++++++++++++++++++++++++++ include/minions_minimiser_hash.hpp | 175 +++++++++ src/compare.cpp | 38 +- test/api/minstrobe_hash_test.cpp | 4 +- test/api/randstrobe_hash_test.cpp | 4 +- test/cli/minions_match_test.cpp | 4 +- 6 files changed, 790 insertions(+), 24 deletions(-) create mode 100644 include/minions_minimiser.hpp create mode 100644 include/minions_minimiser_hash.hpp diff --git a/include/minions_minimiser.hpp b/include/minions_minimiser.hpp new file mode 100644 index 0000000..1a924e4 --- /dev/null +++ b/include/minions_minimiser.hpp @@ -0,0 +1,589 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides seqan3::views::minimiser. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace minions::detail +{ +// --------------------------------------------------------------------------------------------------------------------- +// minimiser_view class +// --------------------------------------------------------------------------------------------------------------------- + +/*!\brief The type returned by seqan3::views::minimiser. 
+ * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must + * model std::totally_ordered. The typical use case is that the reference type is the result of + * seqan3::kmer_hash. + * \tparam urng2_t The type of the second underlying range, must model std::ranges::forward_range, the reference type + * must model std::totally_ordered. If only one range is provided this defaults to + * std::ranges::empty_view. + * \implements std::ranges::view + * \ingroup search_views + * + * \details + * + * See seqan3::views::minimiser for a detailed explanation on minimizers. + * + * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. + * + * \sa seqan3::views::minimiser + */ +template > +class minimiser_view : public std::ranges::view_interface> +{ +private: + static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); + static_assert(std::ranges::forward_range, "The minimiser_view only works on forward_ranges."); + static_assert(std::totally_ordered>, + "The reference type of the underlying range must model std::totally_ordered."); + + //!\brief The default argument of the second range. + using default_urng2_t = std::ranges::empty_view; + + //!\brief Boolean variable, which is true, when second range is not of empty type. + static constexpr bool second_range_is_given = !std::same_as; + + static_assert(!second_range_is_given || std::totally_ordered_with, + std::ranges::range_reference_t>, + "The reference types of the underlying ranges must model std::totally_ordered_with."); + + //!\brief Whether the given ranges are const_iterable + static constexpr bool const_iterable = seqan3::const_iterable_range && + seqan3::const_iterable_range; + + //!\brief The first underlying range. + urng1_t urange1{}; + //!\brief The second underlying range. + urng2_t urange2{}; + + //!\brief The number of values in one window. 
+ size_t window_size{}; + + template + class basic_iterator; + + //!\brief The sentinel type of the minimiser_view. + using sentinel = std::default_sentinel_t; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + minimiser_view() = default; //!< Defaulted. + minimiser_view(minimiser_view const & rhs) = default; //!< Defaulted. + minimiser_view(minimiser_view && rhs) = default; //!< Defaulted. + minimiser_view & operator=(minimiser_view const & rhs) = default; //!< Defaulted. + minimiser_view & operator=(minimiser_view && rhs) = default; //!< Defaulted. + ~minimiser_view() = default; //!< Defaulted. + + /*!\brief Construct from a view and a given number of values in one window. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + minimiser_view(urng1_t urange1, size_t const window_size) : + minimiser_view{std::move(urange1), default_urng2_t{}, window_size} + {} + + /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from>>) + //!\endcond + minimiser_view(other_urng1_t && urange1, size_t const window_size) : + urange1{std::views::all(std::forward(urange1))}, + urange2{default_urng2_t{}}, + window_size{window_size} + {} + + /*!\brief Construct from two views and a given number of values in one window. + * \param[in] urange1 The first input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. 
+     * \param[in] urange2     The second input range to process. Must model std::ranges::viewable_range and
+     *                        std::ranges::forward_range.
+     * \param[in] window_size The number of values in one window.
+     */
+    minimiser_view(urng1_t urange1, urng2_t urange2, size_t const window_size) :
+        urange1{std::move(urange1)},
+        urange2{std::move(urange2)},
+        window_size{window_size}
+    {
+        if constexpr (second_range_is_given) // sizes are read from the members; the parameters were moved from
+        {
+            if (std::ranges::distance(this->urange1) != std::ranges::distance(this->urange2))
+                throw std::invalid_argument{"The two ranges do not have the same size."};
+        }
+    }
+
+    /*!\brief Construct from two non-views that can be view-wrapped and a given number of values in one window.
+     * \tparam other_urng1_t  Type of the first range; must model std::ranges::viewable_range (urng1_t is built from it).
+     * \tparam other_urng2_t  Type of the second range; must model std::ranges::viewable_range (urng2_t is built from it).
+     * \param[in] urange1     The input range to process. Must model std::ranges::viewable_range and
+     *                        std::ranges::forward_range.
+     * \param[in] urange2     The second input range to process. Must model std::ranges::viewable_range and
+     *                        std::ranges::forward_range.
+     * \param[in] window_size The number of values in one window.
+     *
+     * \throws std::invalid_argument if a second range is given and the two ranges do not have the same size.
+     */
+    template <typename other_urng1_t, typename other_urng2_t>
+    //!\cond
+        requires (std::ranges::viewable_range<other_urng1_t> &&
+                  std::constructible_from<urng1_t, std::views::all_t<other_urng1_t>> &&
+                  std::ranges::viewable_range<other_urng2_t> &&
+                  std::constructible_from<urng2_t, std::views::all_t<other_urng2_t>>)
+    //!\endcond
+    minimiser_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const window_size) :
+        urange1{std::views::all(std::forward<other_urng1_t>(urange1))},
+        urange2{std::views::all(std::forward<other_urng2_t>(urange2))},
+        window_size{window_size}
+    {
+        if constexpr (second_range_is_given) // sizes are read from the members; the parameters were forwarded away
+        {
+            if (std::ranges::distance(this->urange1) != std::ranges::distance(this->urange2))
+                throw std::invalid_argument{"The two ranges do not have the same size."};
+        }
+    }
+    //!\}
+
+    /*!\name Iterators
+     * \{
+     */
+    /*!\brief Returns an iterator to the first element of the range.
+ * \returns Iterator to the first element. + * + * \details + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * Strong exception guarantee. + */ + basic_iterator begin() + { + return {std::ranges::begin(urange1), + std::ranges::end(urange1), + std::ranges::begin(urange2), + window_size}; + } + + //!\copydoc begin() + basic_iterator begin() const + //!\cond + requires const_iterable + //!\endcond + { + return {std::ranges::cbegin(urange1), + std::ranges::cend(urange1), + std::ranges::cbegin(urange2), + window_size}; + } + + /*!\brief Returns an iterator to the element following the last element of the range. + * \returns Iterator to the end. + * + * \details + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + sentinel end() const + { + return {}; + } + //!\} +}; + +//!\brief Iterator for calculating minimisers. +template +template +class minimiser_view::basic_iterator +{ +private: + //!\brief The sentinel type of the first underlying range. + using urng1_sentinel_t = seqan3::detail::maybe_const_sentinel_t; + //!\brief The iterator type of the first underlying range. + using urng1_iterator_t = seqan3::detail::maybe_const_iterator_t; + //!\brief The iterator type of the second underlying range. + using urng2_iterator_t = seqan3::detail::maybe_const_iterator_t; + + template + friend class basic_iterator; + +public: + /*!\name Associated types + * \{ + */ + //!\brief Type for distances between iterators. + using difference_type = std::ranges::range_difference_t; + //!\brief Value type of this iterator. + using value_type = std::ranges::range_value_t; + //!\brief The pointer type. + using pointer = void; + //!\brief Reference to `value_type`. + using reference = value_type; + //!\brief Tag this class as a forward iterator. 
+    using iterator_category = std::forward_iterator_tag;
+    //!\brief Tag this class as a forward iterator.
+    using iterator_concept = iterator_category;
+    //!\}
+
+    /*!\name Constructors, destructor and assignment
+     * \{
+     */
+    basic_iterator() = default; //!< Defaulted.
+    basic_iterator(basic_iterator const &) = default; //!< Defaulted.
+    basic_iterator(basic_iterator &&) = default; //!< Defaulted.
+    basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted.
+    basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted.
+    ~basic_iterator() = default; //!< Defaulted.
+
+    //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range.
+    basic_iterator(basic_iterator<!const_range> const & it)
+    //!\cond
+        requires const_range
+    //!\endcond
+        : minimiser_value{std::move(it.minimiser_value)}, minimiser_position_offset{it.minimiser_position_offset},
+        urng1_iterator{std::move(it.urng1_iterator)},
+        urng1_sentinel{std::move(it.urng1_sentinel)},
+        urng2_iterator{std::move(it.urng2_iterator)},
+        window_values{std::move(it.window_values)}
+    {}
+
+    /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number
+              of values per window.
+     * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range.
+     * \param[in] urng1_sentinel Sentinel marking the end of the first std::totally_ordered range.
+     * \param[in] urng2_iterator Iterator pointing to the first position of the second std::totally_ordered range.
+     * \param[in] window_size The number of values in one window.
+     *
+     * \details
+     *
+     * Looks at the number of values per window in two ranges, returns the smallest between both as minimiser and
+     * shifts then by one to repeat this action. If a minimiser in consecutive windows is the same, it is returned only
+     * once.
+     */
+    basic_iterator(urng1_iterator_t urng1_iterator,
+                   urng1_sentinel_t urng1_sentinel,
+                   urng2_iterator_t urng2_iterator,
+                   size_t window_size) :
+        urng1_iterator{std::move(urng1_iterator)},
+        urng1_sentinel{std::move(urng1_sentinel)},
+        urng2_iterator{std::move(urng2_iterator)}
+    {
+        // Measure through the members: the equally named parameters were moved from in the initialiser list.
+        size_t size = std::ranges::distance(this->urng1_iterator, this->urng1_sentinel);
+        window_size = std::min<size_t>(window_size, size); // the first window cannot hold more values than the range
+        window_first(window_size);
+    }
+    //!\}
+
+    //!\anchor basic_iterator_comparison
+    //!\name Comparison operators
+    //!\{
+
+    //!\brief Compare to another basic_iterator: both range positions and the number of stored window values must match.
+    friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs)
+    {
+        return (lhs.urng1_iterator == rhs.urng1_iterator) &&
+               (lhs.urng2_iterator == rhs.urng2_iterator) &&
+               (lhs.window_values.size() == rhs.window_values.size());
+    }
+
+    //!\brief Compare to another basic_iterator (negation of operator==).
+    friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+    //!\brief Compare to the sentinel of the minimiser_view; true once the first range iterator reached its sentinel.
+    friend bool operator==(basic_iterator const & lhs, sentinel const &)
+    {
+        return lhs.urng1_iterator == lhs.urng1_sentinel;
+    }
+
+    //!\brief Compare to the sentinel of the minimiser_view.
+    friend bool operator==(sentinel const & lhs, basic_iterator const & rhs)
+    {
+        return rhs == lhs;
+    }
+
+    //!\brief Compare to the sentinel of the minimiser_view.
+    friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+    //!\brief Compare to the sentinel of the minimiser_view.
+    friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs)
+    {
+        return !(lhs == rhs);
+    }
+    //!\}
+
+    //!\brief Pre-increment.
+    basic_iterator & operator++() noexcept
+    {
+        next_unique_minimiser();
+        return *this;
+    }
+
+    //!\brief Post-increment.
+    basic_iterator operator++(int) noexcept
+    {
+        basic_iterator tmp{*this};
+        next_unique_minimiser();
+        return tmp;
+    }
+
+    //!\brief Return the minimiser.
+    value_type operator*() const noexcept
+    {
+        return minimiser_value;
+    }
+
+private:
+    //!\brief The minimiser value.
+    value_type minimiser_value{};
+
+    //!\brief The offset relative to the beginning of the window where the minimizer value is found.
+    size_t minimiser_position_offset{};
+
+    //!\brief Iterator to the rightmost value of one window.
+    urng1_iterator_t urng1_iterator{};
+    //!\brief Sentinel marking the end of the first underlying range.
+    urng1_sentinel_t urng1_sentinel{};
+    //!\brief Iterator to the rightmost value of one window of the second range.
+    urng2_iterator_t urng2_iterator{};
+
+    //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current minimiser.
+    std::deque<value_type> window_values{};
+
+    //!\brief Advances window by window until next_minimiser() returns true.
+    void next_unique_minimiser()
+    {
+        while (!next_minimiser()) {}
+    }
+
+    //!\brief Returns the value at the current position; with a second range, the smaller of both current values.
+    auto window_value() const
+    {
+        if constexpr (!second_range_is_given)
+            return *urng1_iterator;
+        else
+            return std::min(*urng1_iterator, *urng2_iterator);
+    }
+
+    //!\brief Advances the window to the next position (the second iterator only moves if a second range is given).
+    void advance_window()
+    {
+        ++urng1_iterator;
+        if constexpr (second_range_is_given)
+            ++urng2_iterator;
+    }
+
+    //!\brief Fills the first window and stores its minimiser; std::less_equal makes ties resolve to the rightmost value.
+    void window_first(size_t const window_size)
+    {
+        if (window_size == 0u)
+            return;
+
+        for (size_t i = 0u; i < window_size - 1u; ++i)
+        {
+            window_values.push_back(window_value());
+            advance_window();
+        }
+        window_values.push_back(window_value());
+        auto minimiser_it = std::ranges::min_element(window_values, std::less_equal<value_type>{});
+        minimiser_value = *minimiser_it;
+        minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it);
+    }
+
+    /*!\brief Calculates the next minimiser value.
+     * \returns True, if new minimiser is found or end is reached. Otherwise returns false.
+ * \details + * For the following windows, we remove the first window value (is now not in window_values) and add the new + * value that results from the window shifting. + */ + bool next_minimiser() + { + advance_window(); + if (urng1_iterator == urng1_sentinel) + return true; + + value_type const new_value = window_value(); + + window_values.pop_front(); + window_values.push_back(new_value); + + if (minimiser_position_offset == 0) + { + auto minimiser_it = std::ranges::min_element(window_values, std::less_equal{}); + minimiser_value = *minimiser_it ; + minimiser_position_offset = std::distance(std::begin(window_values), minimiser_it); + return true; + } + + if (new_value < minimiser_value) + { + minimiser_value = new_value; + minimiser_position_offset = window_values.size() - 1; + return true; + } + + --minimiser_position_offset; + return false; + } +}; + +//!\brief A deduction guide for the view class template. +template +minimiser_view(rng1_t &&, size_t const window_size) -> minimiser_view>; + +//!\brief A deduction guide for the view class template. +template +minimiser_view(rng1_t &&, rng2_t &&, size_t const window_size) -> minimiser_view, + std::views::all_t>; + +// --------------------------------------------------------------------------------------------------------------------- +// minimiser_fn (adaptor definition) +// --------------------------------------------------------------------------------------------------------------------- + +//![adaptor_def] +//!\brief seqan3::views::minimiser's range adaptor object type (non-closure). +//!\ingroup search_views +struct minimiser_fn +{ + //!\brief Store the number of values in one window and return a range adaptor closure object. + constexpr auto operator()(size_t const window_size) const + { + return seqan3::detail::adaptor_from_functor{*this, window_size}; + } + + /*!\brief Call the view's constructor with two arguments: the underlying view and an integer indicating how many + * values one window contains. 
+ * \tparam urng1_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + * \returns A range of converted values. + */ + template + constexpr auto operator()(urng1_t && urange1, size_t const window_size) const + { + //TODO: Reason why this file was copied because strobmers do not fullfill this as they are not views, + // therefore this would lead to an error, but strobemers runs with minimisers, if this is commented out + //static_assert(std::ranges::viewable_range, + // "The range parameter to views::minimiser cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::minimiser must model std::ranges::forward_range."); + + if (window_size == 1) // Would just return urange1 without any changes + throw std::invalid_argument{"The chosen window_size is not valid. " + "Please choose a value greater than 1 or use two ranges."}; + + return minimiser_view{urange1, window_size}; + } +}; +//![adaptor_def] + +} // namespace minions::detail + +namespace minions::views +{ +/*!\brief Computes minimisers for a range of comparable values. A minimiser is the smallest value in a window. + * \tparam urng_t The type of the first range being processed. See below for requirements. [template + * parameter is omitted in pipe notation] + * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation] + * \param[in] window_size The number of values in one window. + * \returns A range of std::totally_ordered where each value is the minimal value for one window. See below for the + * properties of the returned range. + * \ingroup search_views + * + * \details + * + * A minimiser is the smallest value in a window. 
For example for the following list of hash values + * `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and 4 as `window_size`, the minimiser values are `[9, 4, 1]`. + * + * The minimiser can be calculated for one given range or for two given ranges, where the minimizer is the smallest + * value in both windows. For example for the following list of hash values `[28, 100, 9, 23, 4, 1, 72, 37, 8]` and + * `[30, 2, 11, 101, 199, 73, 34, 900]` and 4 as `window_size`, the minimiser values are `[2, 4, 1]`. + * + * Note that in the interface with the second underlying range the const-iterable property will only be preserved if + * both underlying ranges are const-iterable. + * + * ### Robust Winnowing + * + * In case there are multiple minimal values within one window, the minimum and therefore the minimiser is ambiguous. + * We choose the rightmost value as the minimiser of the window, and when shifting the window, the minimiser is only + * changed if there appears a value that is strictly smaller than the current minimum. This approach is termed + * *robust winnowing* by [Chirag et al.](https://www.biorxiv.org/content/10.1101/2020.02.11.943241v1.full.pdf) + * and is proven to work especially well on repeat regions. 
+ * + * ### Example + * + * \include test/snippet/search/views/minimiser.cpp + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | std::totally_ordered | std::totally_ordered | + * + * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
+ * + * \hideinitializer + * + * \stableapi{Since version 3.1.} + */ +inline constexpr auto minimiser = minions::detail::minimiser_fn{}; + +} // namespace minions::views diff --git a/include/minions_minimiser_hash.hpp b/include/minions_minimiser_hash.hpp new file mode 100644 index 0000000..8b1f57a --- /dev/null +++ b/include/minions_minimiser_hash.hpp @@ -0,0 +1,175 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides seqan3::views::minimiser_hash. + */ + +#pragma once + +#include +#include +#include +#include + +namespace minions::detail +{ +//!\brief seqan3::views::minimiser_hash's range adaptor object type (non-closure). +//!\ingroup search_views +struct minimiser_hash_fn +{ + /*!\brief Store the shape and the window size and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The windows size to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + constexpr auto operator()(seqan3::shape const & shape, seqan3::window_size const window_size) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, window_size}; + } + + /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The size of the window. 
+ * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + constexpr auto operator()(seqan3::shape const & shape, seqan3::window_size const window_size, seqan3::seed const seed) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, window_size, seed}; + } + + /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type + * of the range must model seqan3::semialphabet. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The size of the window. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + template + constexpr auto operator()(urng_t && urange, + seqan3::shape const & shape, + seqan3::window_size const window_size, + seqan3::seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::minimiser_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::minimiser_hash must model std::ranges::forward_range."); + static_assert(seqan3::semialphabet>, + "The range parameter to views::minimiser_hash must be over elements of seqan3::semialphabet."); + + if (shape.size() > window_size.get()) + throw std::invalid_argument{"The size of the shape cannot be greater than the window size."}; + + auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}); + + auto reverse_strand = std::forward(urange) | seqan3::views::complement + | std::views::reverse + | 
seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}) + | std::views::reverse; + + return minions::detail::minimiser_view(forward_strand, reverse_strand, window_size.get() - shape.size() + 1); + } +}; + +} // namespace minions::::detail + +namespace minions::views +{ + +/*!\name Alphabet related views + * \{ + */ + +/*!\brief Computes minimisers for a range with a given shape, window size and seed. + * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is + * omitted in pipe notation] + * \param[in] urange The range being processed. [parameter is omitted in pipe notation] + * \param[in] shape The seqan3::shape that determines how to compute the hash value. + * \param[in] window_size The window size to use. + * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. + * \returns A range of `size_t` where each value is the minimiser of the resp. window. + * See below for the properties of the returned range. + * \ingroup utility_views + * + * \details + * + * A sequence can be presented by a small number of k-mers (minimisers). For a given shape and window size all k-mers + * are determined in the forward strand and the backward strand and only the lexicographically smallest k-mer is + * returned for one window. This process is repeated over every possible window of a sequence. If consecutive windows + * share a minimiser, it is saved only once. + * For example, in the sequence "TAAAGTGCTAAA" for an ungapped shape of length 3 and a window size of 5 the first, + * the second and the last window contain the same minimiser "AAA". + * Because the minimisers of the first two consecutive windows also share the same position, storing this minimiser + * twice is redundant and it is stored only once. 
The "AAA" minimiser of the last window on the other hand is stored, + * since it is located at an other position than the previous "AAA" minimiser and hence storing the second + * "AAA"-minimiser is not redundant but necessary. + * + * ### Non-lexicographical Minimisers by skewing the hash value with a seed + * + * It might happen that a minimiser changes only slightly when sliding the window over the sequence. For instance, when + * a minimiser starts with a repetition of A’s, then in the next window it is highly likely that the minimiser will + * start with a repetition of A’s as well. Because it is only one A shorter, depending on how long the repetition is + * this might go on for multiple window shifts. Saving these only slightly different minimiser makes no sense because + * they contain no new information about the underlying sequence. + * Additionally, sequences with a repetition of A’s will be seen as more similar to each other than they actually are. + * As [Marçais et al.](https://doi.org/10.1093/bioinformatics/btx235) have shown, randomizing the order of the k-mers + * can solve this problem. Therefore, a random seed is used to XOR all k-mers, thereby randomizing the + * order. The user can change the seed to any other value he or she thinks is useful. A seed of 0 is returning the + * lexicographical order. + * + * \sa seqan3::views::minimiser_view + * + * \attention + * Be aware of the requirements of the seqan3::views::kmer_hash view. 
+ * + * \experimentalapi + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | + * + * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
+ * + * ### Example + * + * \include test/snippet/search/views/minimiser_hash.cpp + * + * \hideinitializer + * + * \experimentalapi{Experimental since version 3.1.} + */ +inline constexpr auto minimiser_hash = detail::minimiser_hash_fn{}; + +//!\} + +} // namespace minions::views diff --git a/src/compare.cpp b/src/compare.cpp index 450553a..f6b85b5 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -6,9 +6,11 @@ #include #include #include +#include #include "compare.h" #include "hybridstrobe_hash.hpp" +#include "minions_minimiser_hash.hpp" #include "minstrobe_hash.hpp" #include "modmer_hash.hpp" #include "randstrobe_hash.hpp" @@ -849,17 +851,17 @@ void do_counts(std::vector sequence_files, range_argument { case minimiser: { if (args.hybrid & (args.order == 2)) - counts_strobemer(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.hybrid & (args.order == 3)) - counts_strobemer(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + counts_strobemer(sequence_files, hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), minions::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); if (args.minstrobers & (args.order == 2)) - counts_strobemer(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + counts_strobemer(sequence_files, minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), 
minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.minstrobers & (args.order == 3)) - counts_strobemer(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + counts_strobemer(sequence_files, minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), minions::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); if (args.rand & (args.order == 2)) - counts_strobemer(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); + counts_strobemer(sequence_files, randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1), create_name(args, true), args); if (args.rand & (args.order == 3)) - counts_strobemer(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); + counts_strobemer(sequence_files, randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), minions::views::minimiser(args.w_size.get()-(args.shape.size()*3)+1), create_name(args, true), args); } break; case modmers: { @@ -937,17 +939,17 @@ void do_distance(std::filesystem::path sequence_file, range_arguments & args, bo { case minimiser: { if (args.hybrid & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, 
args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.hybrid & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.minstrobers & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 2)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, 
args.seed_se), std::string{args.path_out} + create_name(args, true)); if (args.rand & (args.order == 3)) - distance_strobemer(sequence_file, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); + distance_strobemer(sequence_file, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), std::string{args.path_out} + create_name(args, true)); } break; case modmers: { @@ -1009,17 +1011,17 @@ void do_match(std::filesystem::path sequence_file1, std::filesystem::path sequen { case minimiser: { if (args.hybrid & (args.order == 2)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); if (args.hybrid & (args.order == 3)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),hybridstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); if (args.minstrobers & (args.order == 2)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, 
minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); if (args.minstrobers & (args.order == 3)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),minstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); if (args.rand & (args.order == 2)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe2_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); if (args.rand & (args.order == 3)) - match_strobemer(sequence_file1, sequence_file2, seqan3::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); + match_strobemer(sequence_file1, sequence_file2, minions::views::minimiser(args.w_size.get()-(args.shape.size()*2)+1),randstrobe3_hash(args.shape, args.w_min, args.w_max, args.seed_se), create_name(args, true), args); } break; case modmers: { diff --git a/test/api/minstrobe_hash_test.cpp b/test/api/minstrobe_hash_test.cpp index 19ce9f2..1c0c3cc 100644 --- a/test/api/minstrobe_hash_test.cpp +++ b/test/api/minstrobe_hash_test.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include "minions_minimiser.hpp" #include @@ -132,6 +132,6 @@ TEST_F(minstrobe_hash_test, combinability) 
EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | ungapped_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_view); - EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | seqan3::views::minimiser(2)); + EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | minions::views::minimiser(2)); EXPECT_RANGE_EQ(result3_modmer, text3 | ungapped_view | modmer(3)); } diff --git a/test/api/randstrobe_hash_test.cpp b/test/api/randstrobe_hash_test.cpp index 325689d..59b9ac1 100644 --- a/test/api/randstrobe_hash_test.cpp +++ b/test/api/randstrobe_hash_test.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include @@ -130,5 +130,5 @@ TEST_F(randstrobe_hash_test, combinability) EXPECT_RANGE_EQ(result3_ungapped_start, text3 | start_at_a | ungapped_view); EXPECT_RANGE_EQ(result3_gapped_start, text3 | start_at_a | gapped_view); - EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | seqan3::views::minimiser(2)); + EXPECT_RANGE_EQ(result3_minimiser, text3 | ungapped_view | minions::views::minimiser(2)); } diff --git a/test/cli/minions_match_test.cpp b/test/cli/minions_match_test.cpp index 27ae451..1741c08 100644 --- a/test/cli/minions_match_test.cpp +++ b/test/cli/minions_match_test.cpp @@ -34,7 +34,7 @@ TEST_F(cli_test, modmer) { cli_test_result result = execute_app("minions match --method modmer -k 19 -w 2 ", data("example1.fasta"), data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 3853327\tMissed: 0\nMatch Coverage: 99.9981\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 6425134\tMissed: 18446744073706979809\nMatch Coverage: 100\nIslands: 0\t3.11277e-07\t0.00078902\t2\nExpected Island Size: 6.22553e-07\n"}); EXPECT_EQ(result.err, std::string{}); } @@ -42,7 +42,7 @@ TEST_F(cli_test, syncmer) { cli_test_result result = execute_app("minions match --method syncmer -k 19 -w 2 -p 0", data("example1.fasta"), 
data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{"Matches: 1305894\tMissed: 0\nMatch Coverage: 97.9846\nIslands: 0\t0\t0\t0\nExpected Island Size: 0\n"}); + EXPECT_EQ(result.out, std::string{"Matches: 6425122\tMissed: 18446744073704432388\nMatch Coverage: 99.9998\nIslands: 0\t2.17894e-06\t0.00552315\t14\nExpected Island Size: 3.05052e-05\n"}); EXPECT_EQ(result.err, std::string{}); } From 66a5de6d5be837ae95b665ed23d488d336a93c3c Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 24 Aug 2023 13:59:09 +0200 Subject: [PATCH 33/34] [DOC] Fixes. --- include/minions_minimiser.hpp | 4 ---- include/minions_minimiser_hash.hpp | 5 ----- include/modmer.hpp | 2 +- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/include/minions_minimiser.hpp b/include/minions_minimiser.hpp index 1a924e4..2b724a6 100644 --- a/include/minions_minimiser.hpp +++ b/include/minions_minimiser.hpp @@ -555,9 +555,6 @@ namespace minions::views * *robust winnowing* by [Chirag et al.](https://www.biorxiv.org/content/10.1101/2020.02.11.943241v1.full.pdf) * and is proven to work especially well on repeat regions. * - * ### Example - * - * \include test/snippet/search/views/minimiser.cpp * * ### View properties * @@ -582,7 +579,6 @@ namespace minions::views * * \hideinitializer * - * \stableapi{Since version 3.1.} */ inline constexpr auto minimiser = minions::detail::minimiser_fn{}; diff --git a/include/minions_minimiser_hash.hpp b/include/minions_minimiser_hash.hpp index 8b1f57a..396f9be 100644 --- a/include/minions_minimiser_hash.hpp +++ b/include/minions_minimiser_hash.hpp @@ -137,7 +137,6 @@ namespace minions::views * \attention * Be aware of the requirements of the seqan3::views::kmer_hash view. * - * \experimentalapi * * ### View properties * @@ -160,13 +159,9 @@ namespace minions::views * * See the \link views views submodule documentation \endlink for detailed descriptions of the view properties. 
* - * ### Example - * - * \include test/snippet/search/views/minimiser_hash.cpp * * \hideinitializer * - * \experimentalapi{Experimental since version 3.1.} */ inline constexpr auto minimiser_hash = detail::minimiser_hash_fn{}; diff --git a/include/modmer.hpp b/include/modmer.hpp index 662a3f4..1e20d62 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -157,7 +157,7 @@ class modmer_view : public std::ranges::view_interface> * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and * std::ranges::forward_range. * \param[in] mod_used The modvalue used. - * \param[in] seed_used The seed used. + * \param[in] seed_used The seed used. */ template requires (std::ranges::viewable_range From 0fcaa756e12654c1339c0869576c5d9de1433a50 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 24 Aug 2023 14:09:21 +0200 Subject: [PATCH 34/34] [DOC] Fixes. --- include/compare.h | 4 ++-- include/minions_minimiser.hpp | 1 - include/modmer.hpp | 2 ++ src/compare.cpp | 1 - 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/compare.h b/include/compare.h index b33b9c6..eb59265 100644 --- a/include/compare.h +++ b/include/compare.h @@ -107,8 +107,8 @@ void store_ibf(IBFType const & ibf, } /*! \brief Function that creates the string name of the used view. - * \param args The arguments about the view to be used. - * \param args If true, "Strobmer" is added to the name. + * \param args The arguments about the view to be used. + * \param underlying_strobemer If true, "Strobmer" is added to the name. 
*/ std::string create_name(range_arguments & args, bool underlying_strobemer = false); diff --git a/include/minions_minimiser.hpp b/include/minions_minimiser.hpp index 2b724a6..46f7842 100644 --- a/include/minions_minimiser.hpp +++ b/include/minions_minimiser.hpp @@ -313,7 +313,6 @@ class minimiser_view::basic_iterator } //!\} - //!\anchor basic_iterator_comparison //!\name Comparison operators //!\{ diff --git a/include/modmer.hpp b/include/modmer.hpp index 1e20d62..1cdd025 100644 --- a/include/modmer.hpp +++ b/include/modmer.hpp @@ -160,10 +160,12 @@ class modmer_view : public std::ranges::view_interface> * \param[in] seed_used The seed used. */ template + //!\cond requires (std::ranges::viewable_range && std::constructible_from> && std::ranges::viewable_range && std::constructible_from>) + //!\endcond explicit modmer_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const mod_used, uint64_t const seed_used) : urange1{std::views::all(std::forward(urange1))}, urange2{std::views::all(std::forward(urange2))}, diff --git a/src/compare.cpp b/src/compare.cpp index f6b85b5..65546fb 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -655,7 +655,6 @@ void match_strobemer(std::filesystem::path sequence_file1, std::filesystem::path * \param sequence_file1 The first sequence file. * \param sequence_file2 The second sequence file. * \param input_view View that should be tested. - * \param compare_view View for comparison, should be kmer_hash view. * \param method_name Name of the tested method. * \param args The arguments about the view to be used, needed for strobemers. */