From 04261d01ff59b27f4af78e8de8342c43632cb333 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 28 Sep 2023 17:49:10 -0700 Subject: [PATCH] selectsites: adding a summary output file and making output code more uniform using bgzf output format even when not compressed --- src/utils/selectsites.cpp | 194 +++++++++++++++++++++++++++++++------- 1 file changed, 160 insertions(+), 34 deletions(-) diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 1d9030ae..1539e654 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -43,11 +43,97 @@ using std::ios_base; using std::runtime_error; using std::ifstream; using std::unordered_map; +using std::tuple; +using std::tie; using bamxx::bgzf_file; namespace fs = std::filesystem; +struct quick_buf : public std::ostringstream, + public std::basic_stringbuf { + // ADS: By user ecatmur on SO; very fast. Seems to work... + quick_buf() { + // ...but this seems to depend on data layout + static_cast&>(*this).rdbuf(this); + } + void clear() { + // reset buffer pointers (member functions) + setp(pbase(), pbase()); + } + char const* c_str() { + /* between c_str and insertion make sure to clear() */ + *pptr() = '\0'; + return pbase(); + } +}; + +struct selectsites_summary { + + // command_line is the command used to produce this summary file and + // the corresponding results + string command_line{}; + + // n_target_regions is the number of target regions provided as + // input to the command + uint64_t n_target_regions{}; + + // target_region_size is the sum of the sizes of each target region + uint64_t target_region_size{}; + + // n_target_regions_collapsed is the number of target regions after + // having collapsed the input target regions merging those that + // overlap + uint64_t n_target_regions_collapsed{}; + + // target_region_collapsed_size is the sum of the sizes of target + // regions after collapsing + uint64_t target_region_collapsed_size{}; + + // n_sites_total is the total number of sites available in the input + // counts file. This value is displayed as zero if the command + // specified to process the sites on disk. + uint64_t n_sites_total{}; + + // n_sites_selected is the total number of sites in the output file + uint64_t n_sites_selected{}; + + template static auto measure_target_regions(const T &t) + -> tuple { + return { + std::size(t), + accumulate(cbegin(t), cend(t), 0ul, + [](const uint64_t a, const typename T::value_type &v) { + return a + v.get_width(); + }), + }; + } + + auto tostring() const -> string { + std::ostringstream oss; + oss << "command_line: " << command_line << '\n' + << "n_target_regions: " << n_target_regions << '\n' + << "target_region_size: " << target_region_size << '\n' + << "n_target_regions_collapsed: " << n_target_regions_collapsed << '\n' + << "target_region_collapsed_size: " << target_region_collapsed_size + << '\n' + << "n_sites_total: " << n_sites_total << '\n' + << "n_sites_selected: " << n_sites_selected; + return oss.str(); + } +}; + +static auto +write_stats_output(const selectsites_summary &summary, + const string &summary_file) -> void { + if (!summary_file.empty()) { + std::ofstream out_summary(summary_file); + if (!out_summary) throw runtime_error("bad summary output file"); + out_summary << summary.tostring() << endl; + } +} + + static void collapsebed(vector ®ions) { size_t j = 0; @@ -73,17 +159,21 @@ contains(const GenomicRegion &r, const MSite &s) { (r.get_start() <= s.pos && s.pos < r.get_end())); } -template static void +static auto process_all_sites(const bool VERBOSE, const string &sites_file, const unordered_map> ®ions, - T &out) { + bgzf_file &out) -> tuple { bgzf_file in(sites_file, "r"); if (!in) throw runtime_error("cannot open file: " + sites_file); + uint64_t n_sites_total = 0u; + uint64_t n_sites_selected = 0u; + MSite the_site, prev_site; vector::const_iterator i, i_lim; bool chrom_is_relevant = false; while (read_site(in, the_site)) { + ++n_sites_total; if (the_site.chrom != prev_site.chrom) { if (VERBOSE) cerr << "processing " << the_site.chrom << endl; const auto r = regions.find(the_site.chrom); @@ -96,41 +186,58 @@ process_all_sites(const bool VERBOSE, const string &sites_file, if (chrom_is_relevant) { while (i != i_lim && precedes(*i, the_site)) ++i; - if (i != i_lim && contains(*i, the_site)) + if (i != i_lim && contains(*i, the_site)) { + ++n_sites_selected; write_site(out, the_site); + } } std::swap(prev_site, the_site); } + return {n_sites_selected, n_sites_total}; } -static void +static auto get_sites_in_region(ifstream &site_in, const GenomicRegion ®ion, - std::ostream &out) { + bgzf_file &out) -> uint64_t { + quick_buf buf; + const string chrom{region.get_chrom()}; const size_t start_pos = region.get_start(); const size_t end_pos = region.get_end(); find_offset_for_msite(chrom, start_pos, site_in); MSite the_site; + uint64_t n_sites_selected = 0u; // ADS: should only happen once that "the_site.chrom < chrom" and // this is only needed because of end state of binary search on disk while (site_in >> the_site && (the_site.chrom < chrom || (the_site.chrom == chrom && the_site.pos < end_pos))) - if (start_pos <= the_site.pos) out << the_site << endl; + if (start_pos <= the_site.pos) { + // ADS: don't like this -- always needing to "clear()" this + // struct is bad... + buf.clear(); + buf << the_site << '\n'; + if (!out.write(buf.c_str(), buf.tellp())) + throw runtime_error("error writing output"); + ++n_sites_selected; + } + return n_sites_selected; } -static void + +static auto process_with_sites_on_disk(const string &sites_file, vector ®ions, - std::ostream &out) { - + bgzf_file &out) -> uint64_t { ifstream in(sites_file); if (!in) throw runtime_error("cannot open file: " + sites_file); - for (size_t i = 0; i < regions.size() && in; ++i) - get_sites_in_region(in, regions[i], out); + uint64_t n_sites_selected = 0ul; + for (auto i = 0u; i < size(regions) && in; ++i) + n_sites_selected += get_sites_in_region(in, regions[i], out); + return n_sites_selected; } @@ -164,15 +271,28 @@ is_compressed_file(const string &filename) { } +static auto +get_command_line(const int argc, const char **const argv) -> string { + if (argc == 0) return string(); + std::ostringstream cmd; + cmd << '"'; + copy(argv, argv + (argc - 1), std::ostream_iterator(cmd, " ")); + cmd << argv[argc-1] << '"'; + return cmd.str(); +} + + int main_selectsites(int argc, const char **argv) { try { bool VERBOSE = false; - bool load_entire_file = false; + bool keep_file_on_disk = false; + bool compress_output = false; - string outfile; + string outfile("-"); + string summary_file; const string description = "Select sites inside a set of genomic intervals. " @@ -184,9 +304,11 @@ main_selectsites(int argc, const char **argv) { " ", 2); opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, outfile); - opt_parse.add_opt("preload", 'p', - "preload sites (use for large target intervals)", - false, load_entire_file); + opt_parse.add_opt("disk", 'd', "process sites on disk " + "(fast if target intervals are few)", + false, keep_file_on_disk); + opt_parse.add_opt("summary", 'S', "write summary to this file", false, summary_file); + opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); opt_parse.set_show_defaults(); vector leftover_args; @@ -212,11 +334,14 @@ main_selectsites(int argc, const char **argv) { const string sites_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ + selectsites_summary summary; + summary.command_line = get_command_line(argc, argv); + if (!fs::is_regular_file(sites_file)) throw runtime_error("bad input sites file: " + sites_file); if (is_compressed_file(sites_file)) { - load_entire_file = true; + keep_file_on_disk = false; if (VERBOSE) cerr << "input file is so must be loaded" << endl; } @@ -225,6 +350,8 @@ main_selectsites(int argc, const char **argv) { ReadBEDFile(regions_file, regions); if (!check_sorted(regions)) throw runtime_error("regions not sorted in file: " + regions_file); + std::tie(summary.n_target_regions, summary.target_region_size) = + selectsites_summary::measure_target_regions(regions); const size_t n_orig_regions = regions.size(); collapsebed(regions); @@ -232,27 +359,26 @@ main_selectsites(int argc, const char **argv) { cerr << "[number of regions merged due to overlap: " << n_orig_regions - regions.size() << "]" << endl; + std::tie(summary.n_target_regions_collapsed, + summary.target_region_collapsed_size) = + selectsites_summary::measure_target_regions(regions); + unordered_map> regions_lookup; - if ((outfile.empty() || !has_gz_ext(outfile)) && load_entire_file) - regions_by_chrom(regions, regions_lookup); + if (!keep_file_on_disk) regions_by_chrom(regions, regions_lookup); - if (outfile.empty() || !has_gz_ext(outfile)) { - std::ofstream of; - if (!outfile.empty()) of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); - if (!outfile.empty() && !out) - throw runtime_error("failed to open output file: " + outfile); + // open the output file + const string output_mode = compress_output ? "w" : "wu"; + bamxx::bgzf_file out(outfile, output_mode); + if (!out) throw runtime_error("error opening output file: " + outfile); - if (load_entire_file) - process_all_sites(VERBOSE, sites_file, regions_lookup, out); - else + if (keep_file_on_disk) + summary.n_sites_selected = process_with_sites_on_disk(sites_file, regions, out); - } - else { - // not supporting search on disk for gz file - bgzf_file out(outfile, "w"); - process_all_sites(VERBOSE, sites_file, regions_lookup, out); - } + else + std::tie(summary.n_sites_selected, summary.n_sites_total) = + process_all_sites(VERBOSE, sites_file, regions_lookup, out); + + write_stats_output(summary, summary_file); } catch (const runtime_error &e) { cerr << e.what() << endl;