From b786395db2b38ca2e9b7e8372b15fc5d20f33bb6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 16:24:09 -0800 Subject: [PATCH 01/23] xcounts: adding code to get a header from a fasta reference genome file --- src/utils/xcounts.cpp | 57 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index a1ffe46a..45964952 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -38,6 +38,7 @@ using std::runtime_error; using std::string; using std::to_chars; using std::vector; +using std::to_string; using bamxx::bgzf_file; @@ -75,10 +76,55 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { } +static void +get_chrom_sizes(const uint32_t n_threads, const string &filename, + vector &chrom_names, vector &chrom_sizes) { + + bamxx::bam_tpool tpool(n_threads); + + bgzf_file in(filename, "r"); + if (!in) throw runtime_error("could not open file: " + filename); + if (n_threads > 1) + tpool.set_io(in); + + chrom_names.clear(); + chrom_sizes.clear(); + + // use the kstring_t type to more directly use the BGZF file + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) throw runtime_error("failed to acquire buffer"); + + uint64_t chrom_size = 0; + while (getline(in, line)) { + if (line.s[0] == '>') { + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + chrom_names.emplace_back(line.s + 1); + chrom_size = 0; + } + else chrom_size += line.l; + } + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + + assert(size(chrom_names) == size(chrom_sizes)); +} + +static void +write_header(const vector &chrom_names, + const vector &chrom_sizes, bgzf_file &out) { + for (auto i = 0u; i < size(chrom_sizes); ++i) { + const string tmp = + "# " + chrom_names[i] + "\t" + to_string(chrom_sizes[i]) + "\n"; + out.write(tmp.c_str()); + } + out.write("# end_header\n"); +} + int main_xcounts(int argc, const char **argv) { try { size_t n_threads = 1; + string genome_file; string outfile{"-"}; const string description = @@ -89,6 +135,8 @@ main_xcounts(int argc, const char **argv) { " (\"-\" for standard input)", 1); opt_parse.add_opt("output", 'o', "output file (default is standard out)", false, outfile); + opt_parse.add_opt("chroms", 'c', "make header from this reference", + false, genome_file); opt_parse.add_opt("threads", 't', "threads for compression (use few)", false, n_threads); std::vector leftover_args; @@ -113,6 +161,11 @@ main_xcounts(int argc, const char **argv) { const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ + vector chrom_names; + vector chrom_sizes; + if (!genome_file.empty()) + get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes); + bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); @@ -129,6 +182,9 @@ main_xcounts(int argc, const char **argv) { tpool.set_io(out); } + if (!genome_file.empty()) + write_header(chrom_names, chrom_sizes, out); + // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); @@ -144,6 +200,7 @@ main_xcounts(int argc, const char **argv) { MSite site; while (status_ok && getline(in, line)) { if (line.s[0] == '#') { + if (!genome_file.empty()) continue; const auto header_line = string(line.s) + "\n"; sz = size(header_line); status_ok = bgzf_write(out.f, header_line.data(), sz) == sz; From 09eea218c470c9eb36d385fa502fdb9ed844bdc0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 16:56:00 -0800 Subject: [PATCH 02/23] xcounts: testing some threading issues --- src/utils/xcounts.cpp | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 45964952..da14d495 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -77,35 +78,43 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { static void -get_chrom_sizes(const uint32_t n_threads, const string &filename, +get_chrom_sizes(//const uint32_t n_threads, + const string &filename, vector &chrom_names, vector &chrom_sizes) { - bamxx::bam_tpool tpool(n_threads); + // bamxx::bam_tpool tpool(n_threads); - bgzf_file in(filename, "r"); + std::ifstream in(filename); + // bgzf_file in(filename, "r"); if (!in) throw runtime_error("could not open file: " + filename); - if (n_threads > 1) - tpool.set_io(in); + // if (n_threads > 1) + // tpool.set_io(in); chrom_names.clear(); chrom_sizes.clear(); // use the kstring_t type to more directly use the BGZF file - kstring_t line{0, 0, nullptr}; - const int ret = ks_resize(&line, 1024); - if (ret) throw runtime_error("failed to acquire buffer"); + string line; + // kstring_t line{0, 0, nullptr}; + // const int ret = ks_resize(&line, 1024); + // if (ret) throw runtime_error("failed to acquire buffer"); uint64_t chrom_size = 0; while (getline(in, line)) { - if (line.s[0] == '>') { + // if (line.s[0] == '>') { + if (line[0] == '>') { if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - chrom_names.emplace_back(line.s + 1); + // chrom_names.emplace_back(line.s + 1); + chrom_names.emplace_back(line.substr(1)); chrom_size = 0; } - else chrom_size += line.l; + // else chrom_size += line.l; + else chrom_size += size(line); } if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + // ks_free(&line); + assert(size(chrom_names) == size(chrom_sizes)); } @@ -164,7 +173,8 @@ main_xcounts(int argc, const char **argv) { vector chrom_names; vector chrom_sizes; if (!genome_file.empty()) - get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes); + get_chrom_sizes(//n_threads, + genome_file, chrom_names, chrom_sizes); bamxx::bam_tpool tpool(n_threads); @@ -223,6 +233,8 @@ main_xcounts(int argc, const char **argv) { offset = site.pos; } } + ks_free(&line); + if (!status_ok) { cerr << "failed converting " << filename << " to " << outfile << endl; From 048408e407fe40a5a6c2b39c8a300af403c33ef6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 17:24:24 -0800 Subject: [PATCH 03/23] several source files: checking that input file is BGZF format before using the threadpool for decompression --- src/amrfinder/amrfinder.cpp | 4 ++-- src/analysis/methstates.cpp | 9 +++++--- src/utils/covered.cpp | 5 +++-- src/utils/recovered.cpp | 2 +- src/utils/symmetric-cpgs.cpp | 2 +- src/utils/unxcounts.cpp | 2 +- src/utils/xcounts.cpp | 40 +++++++++++++++--------------------- 7 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index 4ecb4812..11cb476a 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -472,10 +472,10 @@ main_amrfinder(int argc, const char **argv) { const EpireadStats epistat{low_prob, high_prob, critical_value, max_itr, use_bic, correct_for_read_count}; - // bamxx::bam_tpool tp(n_threads); + bamxx::bam_tpool tp(n_threads); bgzf_file in(reads_file, "r"); if (!in) throw runtime_error("failed to open input file: " + reads_file); - // if (n_threads > 1) tp.set_io(in); + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); vector amrs; diff --git a/src/analysis/methstates.cpp b/src/analysis/methstates.cpp index 9133bc75..18e28572 100644 --- a/src/analysis/methstates.cpp +++ b/src/analysis/methstates.cpp @@ -254,14 +254,17 @@ main_methstates(int argc, const char **argv) { bamxx::bam_header hdr(in); if (!hdr) throw dnmt_error("cannot read heade" + mapped_reads_file); - /* set the threads for the input file decompression */ - if (n_threads > 1) tp.set_io(in); - // open the output file const string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); if (!out) throw dnmt_error("error opening output file: " + outfile); + /* set the threads for the input file decompression */ + if (n_threads > 1) { + tp.set_io(in); + tp.set_io(out); + } + vector cpgs; unordered_set chroms_seen; diff --git a/src/utils/covered.cpp b/src/utils/covered.cpp index b96aca56..26efbc75 100644 --- a/src/utils/covered.cpp +++ b/src/utils/covered.cpp @@ -113,14 +113,15 @@ main_covered(int argc, const char **argv) { bgzf_file in(filename, "r"); if (!in) throw runtime_error("could not open file: " + filename); - const auto outfile_mode = in.compression() ? "w" : "wu"; + const auto outfile_mode = in.is_compressed() ? "w" : "wu"; bgzf_file out(outfile, outfile_mode); if (!out) throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input - tpool.set_io(in); + if (in.is_bgzf()) + tpool.set_io(in); tpool.set_io(out); } diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp index 60b9a62f..0d30906e 100644 --- a/src/utils/recovered.cpp +++ b/src/utils/recovered.cpp @@ -57,7 +57,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, bamxx::bam_tpool tp(n_threads); // set the threads for the input file decompression - if (n_threads > 1) + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); unordered_set chroms_seen; diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index 0c3844ee..a69c0a7d 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -174,7 +174,7 @@ main_symmetric_cpgs(int argc, const char **argv) { if (!out) throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { - tp.set_io(in); + if (in.is_bgzf()) tp.set_io(in); tp.set_io(out); } diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index d78cf4e7..1229c279 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -85,7 +85,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, bamxx::bam_tpool tp(n_threads); // set the threads for the input file decompression - if (n_threads > 1) + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); unordered_set chroms_seen; diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index da14d495..1585ef0c 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -78,42 +77,37 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { static void -get_chrom_sizes(//const uint32_t n_threads, +get_chrom_sizes(const uint32_t n_threads, const string &filename, vector &chrom_names, vector &chrom_sizes) { - // bamxx::bam_tpool tpool(n_threads); + bamxx::bam_tpool tpool(n_threads); - std::ifstream in(filename); - // bgzf_file in(filename, "r"); + bgzf_file in(filename, "r"); if (!in) throw runtime_error("could not open file: " + filename); - // if (n_threads > 1) - // tpool.set_io(in); + if (n_threads > 1 && in.is_bgzf()) + tpool.set_io(in); chrom_names.clear(); chrom_sizes.clear(); // use the kstring_t type to more directly use the BGZF file - string line; - // kstring_t line{0, 0, nullptr}; - // const int ret = ks_resize(&line, 1024); - // if (ret) throw runtime_error("failed to acquire buffer"); + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) throw runtime_error("failed to acquire buffer"); uint64_t chrom_size = 0; while (getline(in, line)) { - // if (line.s[0] == '>') { - if (line[0] == '>') { + if (line.s[0] == '>') { if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - // chrom_names.emplace_back(line.s + 1); - chrom_names.emplace_back(line.substr(1)); + chrom_names.emplace_back(line.s + 1); chrom_size = 0; } - // else chrom_size += line.l; - else chrom_size += size(line); + else chrom_size += line.l; } if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - // ks_free(&line); + ks_free(&line); assert(size(chrom_names) == size(chrom_sizes)); } @@ -173,22 +167,22 @@ main_xcounts(int argc, const char **argv) { vector chrom_names; vector chrom_sizes; if (!genome_file.empty()) - get_chrom_sizes(//n_threads, - genome_file, chrom_names, chrom_sizes); + get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes); bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); if (!in) throw runtime_error("could not open file: " + filename); - const auto outfile_mode = in.compression() ? "w" : "wu"; + const auto outfile_mode = in.is_compressed() ? "w" : "wu"; bgzf_file out(outfile, outfile_mode); if (!out) throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input - tpool.set_io(in); + if (in.is_bgzf()) + tpool.set_io(in); tpool.set_io(out); } @@ -208,7 +202,7 @@ main_xcounts(int argc, const char **argv) { int64_t sz = 0; MSite site; - while (status_ok && getline(in, line)) { + while (status_ok && (status_ok = bool(getline(in, line)))) { if (line.s[0] == '#') { if (!genome_file.empty()) continue; const auto header_line = string(line.s) + "\n"; From f658558a6b3d256dd64e22738ef7c9907262d495 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 17:28:41 -0800 Subject: [PATCH 04/23] bamxx submodule update --- src/bamxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bamxx b/src/bamxx index 09cfe360..7c2228c2 160000 --- a/src/bamxx +++ b/src/bamxx @@ -1 +1 @@ -Subproject commit 09cfe360be875271014b2824fecd8b732e640630 +Subproject commit 7c2228c2f0233e9df1f1534f374e08c0a196ae55 From 5b3568a6d85d2fe9dec0a66f57d42b216cf900e3 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 17:36:39 -0800 Subject: [PATCH 05/23] xcounts: fixing a bug from previous commit --- src/utils/xcounts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 1585ef0c..582fc7f6 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -202,7 +202,7 @@ main_xcounts(int argc, const char **argv) { int64_t sz = 0; MSite site; - while (status_ok && (status_ok = bool(getline(in, line)))) { + while (status_ok && getline(in, line)) { if (line.s[0] == '#') { if (!genome_file.empty()) continue; const auto header_line = string(line.s) + "\n"; From f50caba10e512373a5cab2620ed6362226f69591 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 17:56:12 -0800 Subject: [PATCH 06/23] unxcounts: including logic to verify header against chromosomes sequences currently just using length --- src/utils/unxcounts.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 1229c279..c08e5908 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -271,6 +271,24 @@ get_chrom_idx(const unordered_map &name_to_idx, return chrom_idx->second; } + +static bool +verify_chrom(const string &header_line, + const unordered_map &name_to_idx, + vector &chrom_sizes) { + std::istringstream iss(header_line.substr(1)); + string name; + uint64_t chrom_size = 0; + if (!(iss >> name >> chrom_size)) + return false; + + const auto idx = name_to_idx.find(name); + if (idx == cend(name_to_idx)) return false; + + return chrom_size == chrom_sizes[idx->second]; +} + + static void process_sites(const bool verbose, const bool add_missing_chroms, const bool compress_output, const size_t n_threads, @@ -309,7 +327,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, // set the threads for the input file decompression if (n_threads > 1) { - tp.set_io(in); + if (in.is_bgzf()) tp.set_io(in); tp.set_io(out); } @@ -329,6 +347,9 @@ process_sites(const bool verbose, const bool add_missing_chroms, while (getline(in, line)) { if (line.s[0] == '#') { + string header_line{line.s}; + if (!verify_chrom(header_line, name_to_idx, chrom_sizes)) + throw runtime_error("failed to convert"); line.s[line.l++] = '\n'; if (bgzf_write(out.f, line.s, line.l) != static_cast(line.l)) throw runtime_error("failed to convert"); From 16a3fdc07e73113831cce77f95baac4424aa8048 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 19:37:25 -0800 Subject: [PATCH 07/23] unxcounts: adding logic to only generate the counts for covered sites --- src/utils/unxcounts.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index c08e5908..2db7ab23 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -291,6 +291,7 @@ verify_chrom(const string &header_line, static void process_sites(const bool verbose, const bool add_missing_chroms, + const bool require_covered, const bool compress_output, const size_t n_threads, const string &infile, const string &outfile, const string &chroms_file) { @@ -390,14 +391,15 @@ process_sites(const bool verbose, const bool add_missing_chroms, res = from_chars(res.ptr + 1, end_line, n_unmeth); const auto curr_pos = pos + pos_step; - if (pos + 1 < curr_pos) + if (require_covered && pos + 1 < curr_pos) write_missing_sites(chrom_name, *chrom_itr, pos + 1, curr_pos, buf, out); write_site(chrom_name, *chrom_itr, curr_pos, n_meth, n_unmeth, buf, out); pos = curr_pos; } } - write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); + if (require_covered) + write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); if (add_missing_chroms) { const int32_t chrom_idx = size(chroms); @@ -418,6 +420,7 @@ main_unxcounts(int argc, const char **argv) { bool verbose = false; bool add_missing_chroms = false; + bool require_covered = false; bool compress_output = false; size_t n_threads = 1; @@ -431,6 +434,8 @@ main_unxcounts(int argc, const char **argv) { ""); opt_parse.add_opt("output", 'o', "output file (required)", true, outfile); opt_parse.add_opt("missing", 'm', "add missing chroms", false, add_missing_chroms); + opt_parse.add_opt("reads", 'r', "report only sites with reads", false, + require_covered); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", true , chroms_file); @@ -455,11 +460,20 @@ main_unxcounts(int argc, const char **argv) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; } + if (require_covered && add_missing_chroms) { + cerr << "options mutually exclusive: reads and missing" << endl; + return EXIT_FAILURE; + } const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - process_sites(verbose, add_missing_chroms, compress_output, n_threads, - filename, outfile, chroms_file); + if (require_covered && add_missing_chroms) { + cerr << "options mutually exclusive: reads and missing" << endl; + return EXIT_FAILURE; + } + + process_sites(verbose, add_missing_chroms, require_covered, compress_output, + n_threads, filename, outfile, chroms_file); } catch (const std::runtime_error &e) { cerr << e.what() << endl; From 2583dc8b11d4fdc87edc380962691e5dee96d4e1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 19:39:28 -0800 Subject: [PATCH 08/23] xcounts: ignoring the mutation mark on sites --- src/utils/xcounts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 582fc7f6..98432c85 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -221,7 +221,7 @@ main_xcounts(int argc, const char **argv) { sz = size(site.chrom); status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz; } - if (site.n_reads > 0 || site.is_mutated()) { + if (site.n_reads > 0) { sz = fill_output_buffer(offset, site, buf); status_ok = bgzf_write(out.f, buf.data(), sz) == sz; offset = site.pos; From 84846cee8c3d4d7d96d64216b27e37ba9fb45bbf Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 19:42:47 -0800 Subject: [PATCH 09/23] unxcounts: better error message on inconsistent header --- src/utils/unxcounts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 2db7ab23..1abf49f7 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -350,7 +350,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, if (line.s[0] == '#') { string header_line{line.s}; if (!verify_chrom(header_line, name_to_idx, chrom_sizes)) - throw runtime_error("failed to convert"); + throw runtime_error("failed to verify header for: " + header_line); line.s[line.l++] = '\n'; if (bgzf_write(out.f, line.s, line.l) != static_cast(line.l)) throw runtime_error("failed to convert"); From 3edadb1d316b9d369307968501c83b9e81176a5b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 19:44:39 -0800 Subject: [PATCH 10/23] unxcounts: trying to find the best header termination --- src/utils/unxcounts.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 1abf49f7..160e7a4f 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -349,6 +349,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, while (getline(in, line)) { if (line.s[0] == '#') { string header_line{line.s}; + if (header_line == "# end_header") continue; if (!verify_chrom(header_line, name_to_idx, chrom_sizes)) throw runtime_error("failed to verify header for: " + header_line); line.s[line.l++] = '\n'; From fd39ce8007f45663c6f5922799a35dbab17e5768 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 20:00:44 -0800 Subject: [PATCH 11/23] unxcounts: better handling of header --- src/utils/unxcounts.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 160e7a4f..3903701c 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -50,6 +50,7 @@ using std::cbegin; using std::cend; using std::copy; using std::copy_n; +using std::to_string; using bamxx::bgzf_file; @@ -349,12 +350,12 @@ process_sites(const bool verbose, const bool add_missing_chroms, while (getline(in, line)) { if (line.s[0] == '#') { string header_line{line.s}; - if (header_line == "# end_header") continue; if (!verify_chrom(header_line, name_to_idx, chrom_sizes)) throw runtime_error("failed to verify header for: " + header_line); line.s[line.l++] = '\n'; - if (bgzf_write(out.f, line.s, line.l) != static_cast(line.l)) - throw runtime_error("failed to convert"); + const int64_t ret = bgzf_write(out.f, line.s, line.l); + if (ret != static_cast(line.l)) + throw runtime_error("failed to convert: " + to_string(ret)); continue; } From 6db3e530ddd3aa914f3996e1b39cef20e8b49145 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 20:01:15 -0800 Subject: [PATCH 12/23] xcounts and counts: header no longer ends with end_header and also there is no space after the # in header lines --- src/analysis/methcounts.cpp | 3 +-- src/utils/xcounts.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index 76aeaefd..ea935799 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -388,10 +388,9 @@ write_header(const bam_header &hdr, bgzf_file &out) { const size_t tid_size = sam_hdr_tid2len(hdr, i); const string tid_name = sam_hdr_tid2name(hdr, i); std::ostringstream oss; - oss << "# " << tid_name << ' ' << tid_size << '\n'; + oss << "#" << tid_name << ' ' << tid_size << '\n'; out.write(oss.str()); } - out.write("# end_header\n"); } template static void diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 98432c85..3a2dc912 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -117,10 +117,9 @@ write_header(const vector &chrom_names, const vector &chrom_sizes, bgzf_file &out) { for (auto i = 0u; i < size(chrom_sizes); ++i) { const string tmp = - "# " + chrom_names[i] + "\t" + to_string(chrom_sizes[i]) + "\n"; + "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; out.write(tmp.c_str()); } - out.write("# end_header\n"); } int From 76ecc8cb204a0c27897bd38ada191c3c26886164 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 20:07:11 -0800 Subject: [PATCH 13/23] unxcounts: fixing backwards condition for requiring coverage --- src/utils/unxcounts.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 3903701c..c6ddd4c6 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -393,14 +393,14 @@ process_sites(const bool verbose, const bool add_missing_chroms, res = from_chars(res.ptr + 1, end_line, n_unmeth); const auto curr_pos = pos + pos_step; - if (require_covered && pos + 1 < curr_pos) + if (!require_covered && pos + 1 < curr_pos) write_missing_sites(chrom_name, *chrom_itr, pos + 1, curr_pos, buf, out); write_site(chrom_name, *chrom_itr, curr_pos, n_meth, n_unmeth, buf, out); pos = curr_pos; } } - if (require_covered) + if (!require_covered) write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); if (add_missing_chroms) { From 944e6606da5cff9e802ef9cec14d46fc86b66822 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 8 Dec 2023 20:18:16 -0800 Subject: [PATCH 14/23] unxcounts: adding a missed spot to check for required coverage --- src/utils/unxcounts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index c6ddd4c6..7276a964 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -361,7 +361,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, if (!std::isdigit(line.s[0])) { - if (pos != num_lim::max()) + if (!require_covered && pos != num_lim::max()) write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); chrom_name = string{line.s}; From 0e329e90da30697db72c56039132c704edd581fd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:05:00 -0800 Subject: [PATCH 15/23] counts_header.cpp counts_header.hpp Makefile.am: adding counts_header stuff --- Makefile.am | 2 + src/common/counts_header.cpp | 163 +++++++++++++++++++++++++++++++++++ src/common/counts_header.hpp | 65 ++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 src/common/counts_header.cpp create mode 100644 src/common/counts_header.hpp diff --git a/Makefile.am b/Makefile.am index 368a8d54..e30d299d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -145,6 +145,7 @@ noinst_LIBRARIES = libdnmtools.a libdnmtools_a_SOURCES = \ src/common/dnmtools_gaussinv.cpp \ src/common/bam_record_utils.cpp \ + src/common/counts_header.cpp \ src/common/BetaBin.cpp \ src/common/EmissionDistribution.cpp \ src/common/Epiread.cpp \ @@ -163,6 +164,7 @@ libdnmtools_a_SOURCES += \ src/bamxx/bamxx.hpp \ src/common/dnmtools_gaussinv.hpp \ src/common/bam_record_utils.hpp \ + src/common/counts_header.hpp \ src/common/BetaBin.hpp \ src/common/EmissionDistribution.hpp \ src/common/Epiread.hpp \ diff --git a/src/common/counts_header.cpp b/src/common/counts_header.cpp new file mode 100644 index 00000000..e6a814b0 --- /dev/null +++ b/src/common/counts_header.cpp @@ -0,0 +1,163 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "counts_header.hpp" + +#include +#include +#include +#include +#include + +#include "bam_record_utils.hpp" + +// generated by autotools +#include + +#include "bamxx.hpp" + +using std::vector; +using std::string; +using std::to_string; + +using bamxx::bgzf_file; + +void +write_counts_header_from_chom_sizes(const vector &chrom_names, + const vector &chrom_sizes, + bgzf_file &out) { + const auto version = "#DNMTOOLS " + string(VERSION) + "\n"; + out.write(version.c_str()); + for (auto i = 0u; i < size(chrom_sizes); ++i) { + const string tmp = + "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; + out.write(tmp.c_str()); + } + out.write("#\n"); +} + + +inline bgzf_file & +getline(bgzf_file &file, kstring_t &line) { + if (file.f == nullptr) return file; + const int x = bgzf_getline(file.f, '\n', &line); + if (x == -1) { + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + } + if (x < -1) { + // ADS: this is an error condition and should be handled + // differently from the EOF above. + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + } + return file; +} + + +bamxx::bgzf_file & +skip_counts_header(bamxx::bgzf_file &in) { + + // use the kstring_t type to more directly use the BGZF file + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) return in; + + while (getline(in, line) && line.s[0] == '#') { + if (line.s[0] == '#' && line.l == 1) + return in; + } + // otherwise we have missed the final line of the header + assert(line.s[0] != '#'); + return in; +} + + +int +get_chrom_sizes_for_counts_header(const uint32_t n_threads, + const string &filename, + vector &chrom_names, + vector &chrom_sizes) { + + bamxx::bam_tpool tpool(n_threads); + + bgzf_file in(filename, "r"); + if (!in) return -1; + if (n_threads > 1 && in.is_bgzf()) + tpool.set_io(in); + + chrom_names.clear(); + chrom_sizes.clear(); + + // use the kstring_t type to more directly use the BGZF file + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) return -1; + + uint64_t chrom_size = 0; + while (getline(in, line)) { + if (line.s[0] == '>') { + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + chrom_names.emplace_back(line.s + 1); + chrom_size = 0; + } + else chrom_size += line.l; + } + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + + ks_free(&line); + + assert(size(chrom_names) == size(chrom_sizes)); + + return 0; +} + + +void +write_counts_header_from_bam_header(const bamxx::bam_header &hdr, + bgzf_file &out) { + const auto version = "#DNMTOOLS " + string(VERSION) + "\n"; + out.write(version.c_str()); + for (auto i = 0; i < hdr.h->n_targets; ++i) { + const size_t tid_size = sam_hdr_tid2len(hdr, i); + const string tid_name = sam_hdr_tid2name(hdr, i); + std::ostringstream oss; + oss << "#" << tid_name << ' ' << tid_size << '\n'; + out.write(oss.str()); + } + out.write("#\n"); +} + + +bool +write_counts_header_line(string line, bgzf_file &out) { + line += '\n'; + const int64_t ret = bgzf_write(out.f, line.data(), size(line)); + return ret == static_cast(size(line)); +} + +bool +has_counts_header(const string &filename) { + bgzf_file in(filename, "r"); + if (!in) return false; + string line; + if (!getline(in, line)) return false; + return line[0] == '#'; +} diff --git a/src/common/counts_header.hpp b/src/common/counts_header.hpp new file mode 100644 index 00000000..171eff28 --- /dev/null +++ b/src/common/counts_header.hpp @@ -0,0 +1,65 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef COUNTS_HEADER_HPP +#define COUNTS_HEADER_HPP + +#include +#include +#include + +#include "bamxx.hpp" + +void +write_counts_header_from_chom_sizes(const std::vector &chrom_names, + const std::vector &chrom_sizes, + bamxx::bgzf_file &out); + +// returns -1 on failure, 0 on success +int +get_chrom_sizes_for_counts_header(const uint32_t n_threads, + const std::string &filename, + std::vector &chrom_names, + std::vector &chrom_sizes); + +void +write_counts_header_from_bam_header(const bamxx::bam_header &hdr, + bamxx::bgzf_file &out); + +bool +write_counts_header_line(std::string line, bamxx::bgzf_file &out); + +bamxx::bgzf_file & +skip_counts_header(bamxx::bgzf_file &in); + +bool +has_counts_header(const std::string &filename); + +inline bool +is_counts_header_version_line(const std::string &line) { + const auto version_line = "#DNMTOOLS"; + return line.compare(0, 9, version_line) == 0; +} + +template +inline bool +is_counts_header_line(T &line) { + return line[0] == '#'; +} + +#endif From 2bfcef004f43426d6ddd20ad21f1de46167567e0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:05:50 -0800 Subject: [PATCH 16/23] MSite.cpp: adding a loop to pass the header when trying to determine if a file is a counts file --- src/common/MSite.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index 36e47fcf..27bd1a22 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -24,6 +24,7 @@ #include #include "smithlab_utils.hpp" +#include "counts_header.hpp" using std::string; using std::runtime_error; @@ -361,7 +362,8 @@ is_msite_file(const string &file) { if (!in) throw runtime_error("cannot open file: " + file); string line; - if (!getline(in, line)) return false; + while (getline(in, line) && is_counts_header_line(line)) + ; return is_msite_line(line); } From dd6cb9d6164c291df9a66720c83c896322671786 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:06:19 -0800 Subject: [PATCH 17/23] hmr.cpp and pmd.cpp: adding code to skip the header when reading sites --- src/analysis/hmr.cpp | 4 ++++ src/analysis/pmd.cpp | 27 +++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp index 4d06ffb4..882caed9 100644 --- a/src/analysis/hmr.cpp +++ b/src/analysis/hmr.cpp @@ -33,6 +33,7 @@ #include "TwoStateHMM.hpp" #include "MSite.hpp" +#include "counts_header.hpp" using std::string; using std::vector; @@ -337,6 +338,9 @@ load_cpgs(const string &cpgs_file, vector &cpgs, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("failed opening file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + MSite prev_site, the_site; while (read_site(in, the_site)) { if (!the_site.is_cpg() || distance(prev_site, the_site) < 2) diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 92bee14b..07c3e3b1 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -33,6 +33,7 @@ #include "GenomicRegion.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" +#include "counts_header.hpp" #include "TwoStateHMM_PMD.hpp" #include "MSite.hpp" @@ -240,8 +241,11 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, static const double array_coverage_constant = 10; vector in(cpgs_file.size()); - for (size_t i = 0; i < cpgs_file.size(); ++i) + for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); + if (has_counts_header(cpgs_file[i])) + skip_counts_header(*in[i]); + } std::map > pos_meth_tot; size_t n_meth = 0ul, n_reads = 0ul; @@ -344,8 +348,11 @@ find_exact_boundaries(const vector &cpgs_file, static const double array_coverage_constant = 10; vector in(cpgs_file.size()); - for (size_t i = 0; i < cpgs_file.size(); ++i) + for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); + if (has_counts_header(cpgs_file[i])) + skip_counts_header(*in[i]); + } std::map > pos_meth_tot; size_t n_meth = 0ul, n_reads = 0ul; @@ -653,6 +660,7 @@ shuffle_bins(const size_t rng_seed, sort(begin(domain_scores), end(domain_scores)); } + static void assign_p_values(const vector &random_scores, const vector &observed_scores, @@ -665,6 +673,7 @@ assign_p_values(const vector &random_scores, } } + static void read_params_file(const bool VERBOSE, const string ¶ms_file, @@ -759,6 +768,9 @@ check_if_array_data(const string &infile) { bgzf_file in(infile, "r"); if (!in) throw std::runtime_error("bad file: " + infile); + if (has_counts_header(infile)) + skip_counts_header(in); + std::string line; getline(in, line); std::istringstream iss(line); @@ -779,6 +791,10 @@ load_array_data(const size_t bin_size, bgzf_file in(cpgs_file, "r"); if (!in) throw std::runtime_error("bad sites file: " + cpgs_file); + + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + string curr_chrom; size_t prev_pos = 0ul, curr_pos = 0ul; double array_meth_bin = 0.0; @@ -872,6 +888,9 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("bad sites file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + // keep track of the chroms we've seen string curr_chrom; std::unordered_set chroms_seen; @@ -947,6 +966,9 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("bad methcounts file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + // keep track of where we are and what we've seen size_t bin_start = 0ul; string curr_chrom; @@ -970,6 +992,7 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, } } + static double good_bins_frac(const vector &cumulative, const size_t min_bin_size, const size_t bin_size, const size_t min_cov_to_pass) { From 699a0f282fb0b86123f707254942a215d4d959d0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:06:53 -0800 Subject: [PATCH 18/23] methcounts.cpp: refactoring to move the header code into counts_header.chpp --- src/analysis/methcounts.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index ea935799..4eab69e6 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -36,6 +36,7 @@ #include "bsutils.hpp" #include "dnmt_error.hpp" #include "bam_record_utils.hpp" +#include "counts_header.hpp" /* HTSlib */ #include @@ -382,16 +383,6 @@ consistent_targets(const bam_header &hdr, return true; } -static void -write_header(const bam_header &hdr, bgzf_file &out) { - for (auto i = 0; i < hdr.h->n_targets; ++i) { - const size_t tid_size = sam_hdr_tid2len(hdr, i); - const string tid_name = sam_hdr_tid2name(hdr, i); - std::ostringstream oss; - oss << "#" << tid_name << ' ' << tid_size << '\n'; - out.write(oss.str()); - } -} template static void process_reads(const bool VERBOSE, const bool show_progress, @@ -442,7 +433,7 @@ process_reads(const bool VERBOSE, const bool show_progress, } if (include_header) - write_header(hdr, out); + write_counts_header_from_bam_header(hdr, out); // now iterate over the reads, switching chromosomes and writing // output as needed From edeb17daf3eaed1e2ac4adae51cd272f37d76e9b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:07:19 -0800 Subject: [PATCH 19/23] recovered.cpp: refactoring to move the header stuff into counts_header --- src/utils/recovered.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp index 0d30906e..c268c238 100644 --- a/src/utils/recovered.cpp +++ b/src/utils/recovered.cpp @@ -30,6 +30,7 @@ #include "smithlab_os.hpp" #include "bsutils.hpp" #include "dnmt_error.hpp" +#include "counts_header.hpp" #include "MSite.hpp" @@ -67,7 +68,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, int32_t prev_idx = -1; while (getline(in, line)) { - if (line[0] == '#') continue; + if (is_counts_header_line(line)) continue; line.resize(line.find_first_of(" \t")); if (line != prev_chrom) { if (verbose) cerr << "verifying: " << line << endl; @@ -277,7 +278,10 @@ process_sites(const bool verbose, const bool add_missing_chroms, string line; while (getline(in, line)) { - if (line[0] == '#') continue; + if (is_counts_header_line(line)) { + write_counts_header_line(line, out); + continue; + } site.initialize(line.data(), line.data() + size(line)); if (site.chrom != chrom_name) { From c13c28d98d3a186128c9af84d6c229b78598c368 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:07:36 -0800 Subject: [PATCH 20/23] smmetric-cpgs.cpp: refactoring to move the header stuff into counts_header --- src/utils/symmetric-cpgs.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index a69c0a7d..3c4190cb 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -33,6 +33,8 @@ #include "smithlab_os.hpp" #include "smithlab_utils.hpp" +#include "counts_header.hpp" + using std::cerr; using std::cout; using std::endl; @@ -61,9 +63,8 @@ get_first_site(T &in, T &out) { string line; bool within_header = true; while (within_header && getline(in, line)) { - if (line[0] == '#') { - line += '\n'; - out.write(line); + if (is_counts_header_line(line)) { + write_counts_header_line(line, out); } else { prev_site.initialize(line.data(), line.data() + size(line)); From 1e1707c53248d3cd5c234b95468163a8f4f9c592 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:10:19 -0800 Subject: [PATCH 21/23] xcounts.cpp: refactoring to move the header stuff into counts_header --- src/utils/xcounts.cpp | 84 +++++++++++++------------------------------ 1 file changed, 24 insertions(+), 60 deletions(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 3a2dc912..ab271e6b 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -29,7 +29,10 @@ #include "OptionParser.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" + #include "MSite.hpp" +#include "counts_header.hpp" +#include "dnmt_error.hpp" using std::cerr; using std::cout; @@ -76,55 +79,10 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { } -static void -get_chrom_sizes(const uint32_t n_threads, - const string &filename, - vector &chrom_names, vector &chrom_sizes) { - - bamxx::bam_tpool tpool(n_threads); - - bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); - if (n_threads > 1 && in.is_bgzf()) - tpool.set_io(in); - - chrom_names.clear(); - chrom_sizes.clear(); - - // use the kstring_t type to more directly use the BGZF file - kstring_t line{0, 0, nullptr}; - const int ret = ks_resize(&line, 1024); - if (ret) throw runtime_error("failed to acquire buffer"); - - uint64_t chrom_size = 0; - while (getline(in, line)) { - if (line.s[0] == '>') { - if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - chrom_names.emplace_back(line.s + 1); - chrom_size = 0; - } - else chrom_size += line.l; - } - if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - - ks_free(&line); - - assert(size(chrom_names) == size(chrom_sizes)); -} - -static void -write_header(const vector &chrom_names, - const vector &chrom_sizes, bgzf_file &out) { - for (auto i = 0u; i < size(chrom_sizes); ++i) { - const string tmp = - "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; - out.write(tmp.c_str()); - } -} - int main_xcounts(int argc, const char **argv) { try { + bool require_coverage = false; size_t n_threads = 1; string genome_file; @@ -139,6 +97,8 @@ main_xcounts(int argc, const char **argv) { false, outfile); opt_parse.add_opt("chroms", 'c', "make header from this reference", false, genome_file); + opt_parse.add_opt("reads", 'r', "ouput only sites with reads", + false, require_coverage); opt_parse.add_opt("threads", 't', "threads for compression (use few)", false, n_threads); std::vector leftover_args; @@ -165,18 +125,24 @@ main_xcounts(int argc, const char **argv) { vector chrom_names; vector chrom_sizes; - if (!genome_file.empty()) - get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes); + if (!genome_file.empty()) { + const int ret = + get_chrom_sizes_for_counts_header(n_threads, genome_file, + chrom_names, chrom_sizes); + if (ret) throw dnmt_error("failed to get chrom sizes from: " + + genome_file); + } + bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); + if (!in) throw dnmt_error("could not open file: " + filename); const auto outfile_mode = in.is_compressed() ? "w" : "wu"; bgzf_file out(outfile, outfile_mode); - if (!out) throw runtime_error("error opening output file: " + outfile); + if (!out) throw dnmt_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input @@ -186,27 +152,25 @@ main_xcounts(int argc, const char **argv) { } if (!genome_file.empty()) - write_header(chrom_names, chrom_sizes, out); + write_counts_header_from_chom_sizes(chrom_names, chrom_sizes, out); // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); - if (ret) throw runtime_error("failed to acquire buffer"); + if (ret) throw dnmt_error("failed to acquire buffer"); vector buf(128); uint32_t offset = 0; string prev_chrom; bool status_ok = true; - int64_t sz = 0; MSite site; while (status_ok && getline(in, line)) { - if (line.s[0] == '#') { + if (is_counts_header_line(line.s)) { if (!genome_file.empty()) continue; - const auto header_line = string(line.s) + "\n"; - sz = size(header_line); - status_ok = bgzf_write(out.f, header_line.data(), sz) == sz; + const string header_line{line.s}; + write_counts_header_line(header_line, out); continue; } status_ok = site.initialize(line.s, line.s + line.l); @@ -217,11 +181,11 @@ main_xcounts(int argc, const char **argv) { offset = 0; site.chrom += '\n'; - sz = size(site.chrom); + const int64_t sz = size(site.chrom); status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz; } if (site.n_reads > 0) { - sz = fill_output_buffer(offset, site, buf); + const int64_t sz = fill_output_buffer(offset, site, buf); status_ok = bgzf_write(out.f, buf.data(), sz) == sz; offset = site.pos; } @@ -234,7 +198,7 @@ main_xcounts(int argc, const char **argv) { return EXIT_FAILURE; } } - catch (const std::runtime_error &e) { + catch (const std::exception &e) { cerr << e.what() << endl; return EXIT_FAILURE; } From 6acba7b2629822afdd714b94e3c2d24df9e4f668 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 9 Dec 2023 14:10:33 -0800 Subject: [PATCH 22/23] unxcounts.cpp: refactoring to move the header stuff into counts_header --- src/utils/unxcounts.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 7276a964..156eb885 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -31,6 +31,7 @@ #include "smithlab_os.hpp" #include "bsutils.hpp" #include "dnmt_error.hpp" +#include "counts_header.hpp" #include "MSite.hpp" @@ -97,7 +98,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, if (ret) throw runtime_error("failed to acquire buffer"); while (getline(in, line)) { - if (line.s[0] == '#') continue; + if (is_counts_header_line(line.s)) continue; if (std::isdigit(line.s[0])) continue; string chrom{line.s}; @@ -252,8 +253,10 @@ write_site(const string &name, const string &chrom, return bgzf_write(out.f, buf.data(), sz) == sz; } + typedef vector::const_iterator chrom_itr_t; + static chrom_itr_t get_chrom(const unordered_map &chrom_lookup, const string &chrom_name) { @@ -263,6 +266,7 @@ get_chrom(const unordered_map &chrom_lookup, return chrom_idx->second; } + static int32_t get_chrom_idx(const unordered_map &name_to_idx, const string &chrom_name) { @@ -277,6 +281,8 @@ static bool verify_chrom(const string &header_line, const unordered_map &name_to_idx, vector &chrom_sizes) { + if (is_counts_header_version_line(header_line)) + return true; std::istringstream iss(header_line.substr(1)); string name; uint64_t chrom_size = 0; @@ -348,14 +354,12 @@ process_sites(const bool verbose, const bool add_missing_chroms, chrom_itr_t chrom_itr; while (getline(in, line)) { - if (line.s[0] == '#') { + if (is_counts_header_line(line.s)) { string header_line{line.s}; - if (!verify_chrom(header_line, name_to_idx, chrom_sizes)) + if (size(header_line) > 1 && !verify_chrom(header_line, name_to_idx, chrom_sizes)) throw runtime_error("failed to verify header for: " + header_line); - line.s[line.l++] = '\n'; - const int64_t ret = bgzf_write(out.f, line.s, line.l); - if (ret != static_cast(line.l)) - throw runtime_error("failed to convert: " + to_string(ret)); + if (!write_counts_header_line(header_line, out)) + throw runtime_error("failed to write header line: " + header_line); continue; } From 32fd022c49214a7d84fcb008b9d0c09ffaf8107f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 11 Dec 2023 18:05:50 -0800 Subject: [PATCH 23/23] uniq.cpp: adding a variable to control the max buffer size which matters for some strange data sets with hundreds of thousands of reads at the same mapping location --- src/utils/uniq.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp index b5a7c77b..41a64735 100644 --- a/src/utils/uniq.cpp +++ b/src/utils/uniq.cpp @@ -197,9 +197,10 @@ process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped, } static void -uniq(const bool add_dup_count, const size_t n_threads, - const string &cmd, const string &infile, const string &statfile, - const string &histfile, const bool bam_format, const string &outfile) { +uniq(const bool add_dup_count, const uint32_t max_buffer_size, + const size_t n_threads, const string &cmd, const string &infile, + const string &statfile, const string &histfile, const bool bam_format, + const string &outfile) { // values to tabulate stats; no real cost rd_stats rs_in, rs_out; size_t reads_duped = 0; @@ -254,7 +255,10 @@ uniq(const bool add_dup_count, const size_t n_threads, if (!equivalent_chrom_and_start(buffer[0], aln)) process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out); - buffer.push_back(aln); + if (size(buffer) < max_buffer_size || add_dup_count) + buffer.push_back(aln); + else if (!add_dup_count) + std::swap(buffer[uniq_random::rand() % max_buffer_size], aln); } process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out); } @@ -266,6 +270,7 @@ uniq(const bool add_dup_count, const size_t n_threads, int main_uniq(int argc, const char **argv) { try { + uint32_t max_buffer_size = std::numeric_limits::max(); bool VERBOSE = false; bool bam_format = false; @@ -297,6 +302,8 @@ main_uniq(int argc, const char **argv) { opt_parse.add_opt("stdout", '\0', "write to standard output", false, use_stdout); opt_parse.add_opt("seed", 's', "random seed", false, the_seed); + opt_parse.add_opt("max", 'm', "max duplicates to consider", + false, max_buffer_size); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); opt_parse.set_show_defaults(); vector leftover_args; @@ -344,7 +351,7 @@ main_uniq(int argc, const char **argv) { << "[command line: \"" << cmd.str() << "\"]" << endl << "[random number seed: " << the_seed << "]" << endl; - uniq(add_dup_count, n_threads, cmd.str(), infile, statfile, + uniq(add_dup_count, max_buffer_size, n_threads, cmd.str(), infile, statfile, histfile, bam_format, outfile); } catch (const runtime_error &e) {