From b786395db2b38ca2e9b7e8372b15fc5d20f33bb6 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 16:24:09 -0800
Subject: [PATCH 01/23] xcounts: adding code to get a header from a fasta
 reference genome file

---
 src/utils/xcounts.cpp | 57 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index a1ffe46a..45964952 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -38,6 +38,7 @@ using std::runtime_error;
 using std::string;
 using std::to_chars;
 using std::vector;
+using std::to_string;
 
 using bamxx::bgzf_file;
 
@@ -75,10 +76,55 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
 }
 
 
+static void
+get_chrom_sizes(const uint32_t n_threads, const string &filename,
+                vector<string> &chrom_names, vector<uint64_t> &chrom_sizes) {
+
+  bamxx::bam_tpool tpool(n_threads);
+
+  bgzf_file in(filename, "r");
+  if (!in) throw runtime_error("could not open file: " + filename);
+  if (n_threads > 1)
+    tpool.set_io(in);
+
+  chrom_names.clear();
+  chrom_sizes.clear();
+
+  // use the kstring_t type to more directly use the BGZF file
+  kstring_t line{0, 0, nullptr};
+  const int ret = ks_resize(&line, 1024);
+  if (ret) throw runtime_error("failed to acquire buffer");
+
+  uint64_t chrom_size = 0;
+  while (getline(in, line)) {
+    if (line.s[0] == '>') {
+      if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
+      chrom_names.emplace_back(line.s + 1);
+      chrom_size = 0;
+    }
+    else chrom_size += line.l;
+  }
+  if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
+
+  assert(size(chrom_names) == size(chrom_sizes));
+}
+
+static void
+write_header(const vector<string> &chrom_names,
+             const vector<uint64_t> &chrom_sizes, bgzf_file &out) {
+  for (auto i = 0u; i < size(chrom_sizes); ++i) {
+    const string tmp =
+      "# " + chrom_names[i] + "\t" + to_string(chrom_sizes[i]) + "\n";
+    out.write(tmp.c_str());
+  }
+  out.write("# end_header\n");
+}
+
 int
 main_xcounts(int argc, const char **argv) {
   try {
     size_t n_threads = 1;
+    string genome_file;
 
     string outfile{"-"};
     const string description =
@@ -89,6 +135,8 @@ main_xcounts(int argc, const char **argv) {
                            "<counts-file> (\"-\" for standard input)", 1);
     opt_parse.add_opt("output", 'o', "output file (default is standard out)",
                       false, outfile);
+    opt_parse.add_opt("chroms", 'c', "make header from this reference",
+                      false, genome_file);
     opt_parse.add_opt("threads", 't', "threads for compression (use few)",
                       false, n_threads);
     std::vector<string> leftover_args;
@@ -113,6 +161,11 @@ main_xcounts(int argc, const char **argv) {
     const string filename(leftover_args.front());
     /****************** END COMMAND LINE OPTIONS *****************/
 
+    vector<string> chrom_names;
+    vector<uint64_t> chrom_sizes;
+    if (!genome_file.empty())
+      get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes);
+
     bamxx::bam_tpool tpool(n_threads);
 
     bgzf_file in(filename, "r");
@@ -129,6 +182,9 @@ main_xcounts(int argc, const char **argv) {
       tpool.set_io(out);
     }
 
+    if (!genome_file.empty())
+      write_header(chrom_names, chrom_sizes, out);
+
     // use the kstring_t type to more directly use the BGZF file
     kstring_t line{0, 0, nullptr};
     const int ret = ks_resize(&line, 1024);
@@ -144,6 +200,7 @@ main_xcounts(int argc, const char **argv) {
     MSite site;
     while (status_ok && getline(in, line)) {
       if (line.s[0] == '#') {
+        if (!genome_file.empty()) continue;
         const auto header_line = string(line.s) + "\n";
         sz = size(header_line);
         status_ok = bgzf_write(out.f, header_line.data(), sz) == sz;

From 09eea218c470c9eb36d385fa502fdb9ed844bdc0 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 16:56:00 -0800
Subject: [PATCH 02/23] xcounts: testing some threading issues

---
 src/utils/xcounts.cpp | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index 45964952..da14d495 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -19,6 +19,7 @@
 #include <bamxx.hpp>
 
 #include <iostream>
+#include <fstream>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -77,35 +78,43 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
 
 
 static void
-get_chrom_sizes(const uint32_t n_threads, const string &filename,
+get_chrom_sizes(//const uint32_t n_threads,
+                const string &filename,
                 vector<string> &chrom_names, vector<uint64_t> &chrom_sizes) {
 
-  bamxx::bam_tpool tpool(n_threads);
+  // bamxx::bam_tpool tpool(n_threads);
 
-  bgzf_file in(filename, "r");
+  std::ifstream in(filename);
+  // bgzf_file in(filename, "r");
   if (!in) throw runtime_error("could not open file: " + filename);
-  if (n_threads > 1)
-    tpool.set_io(in);
+  // if (n_threads > 1)
+  //   tpool.set_io(in);
 
   chrom_names.clear();
   chrom_sizes.clear();
 
   // use the kstring_t type to more directly use the BGZF file
-  kstring_t line{0, 0, nullptr};
-  const int ret = ks_resize(&line, 1024);
-  if (ret) throw runtime_error("failed to acquire buffer");
+  string line;
+  // kstring_t line{0, 0, nullptr};
+  // const int ret = ks_resize(&line, 1024);
+  // if (ret) throw runtime_error("failed to acquire buffer");
 
   uint64_t chrom_size = 0;
   while (getline(in, line)) {
-    if (line.s[0] == '>') {
+    // if (line.s[0] == '>') {
+    if (line[0] == '>') {
       if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
-      chrom_names.emplace_back(line.s + 1);
+      // chrom_names.emplace_back(line.s + 1);
+      chrom_names.emplace_back(line.substr(1));
       chrom_size = 0;
     }
-    else chrom_size += line.l;
+    // else chrom_size += line.l;
+    else chrom_size += size(line);
   }
   if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
 
+  // ks_free(&line);
+
   assert(size(chrom_names) == size(chrom_sizes));
 }
 
@@ -164,7 +173,8 @@ main_xcounts(int argc, const char **argv) {
     vector<string> chrom_names;
     vector<uint64_t> chrom_sizes;
     if (!genome_file.empty())
-      get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes);
+      get_chrom_sizes(//n_threads,
+                      genome_file, chrom_names, chrom_sizes);
 
     bamxx::bam_tpool tpool(n_threads);
 
@@ -223,6 +233,8 @@ main_xcounts(int argc, const char **argv) {
         offset = site.pos;
       }
     }
+    ks_free(&line);
+
     if (!status_ok) {
       cerr << "failed converting "
            << filename << " to " << outfile << endl;

From 048408e407fe40a5a6c2b39c8a300af403c33ef6 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 17:24:24 -0800
Subject: [PATCH 03/23] several source files: checking that input file is BGZF
 format before using the threadpool for decompression

---
 src/amrfinder/amrfinder.cpp  |  4 ++--
 src/analysis/methstates.cpp  |  9 +++++---
 src/utils/covered.cpp        |  5 +++--
 src/utils/recovered.cpp      |  2 +-
 src/utils/symmetric-cpgs.cpp |  2 +-
 src/utils/unxcounts.cpp      |  2 +-
 src/utils/xcounts.cpp        | 40 +++++++++++++++---------------------
 7 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp
index 4ecb4812..11cb476a 100644
--- a/src/amrfinder/amrfinder.cpp
+++ b/src/amrfinder/amrfinder.cpp
@@ -472,10 +472,10 @@ main_amrfinder(int argc, const char **argv) {
     const EpireadStats epistat{low_prob, high_prob, critical_value,
                                max_itr, use_bic, correct_for_read_count};
 
-    // bamxx::bam_tpool tp(n_threads);
+    bamxx::bam_tpool tp(n_threads);
     bgzf_file in(reads_file, "r");
     if (!in) throw runtime_error("failed to open input file: " + reads_file);
-    // if (n_threads > 1) tp.set_io(in);
+    if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);
 
     vector<GenomicRegion> amrs;
 
diff --git a/src/analysis/methstates.cpp b/src/analysis/methstates.cpp
index 9133bc75..18e28572 100644
--- a/src/analysis/methstates.cpp
+++ b/src/analysis/methstates.cpp
@@ -254,14 +254,17 @@ main_methstates(int argc, const char **argv) {
     bamxx::bam_header hdr(in);
     if (!hdr) throw dnmt_error("cannot read heade" + mapped_reads_file);
 
-    /* set the threads for the input file decompression */
-    if (n_threads > 1) tp.set_io(in);
-
     // open the output file
     const string output_mode = compress_output ? "w" : "wu";
     bamxx::bgzf_file out(outfile, output_mode);
     if (!out) throw dnmt_error("error opening output file: " + outfile);
 
+    /* set the threads for the input file decompression */
+    if (n_threads > 1) {
+      tp.set_io(in);
+      tp.set_io(out);
+    }
+
     vector<uint64_t> cpgs;
 
     unordered_set<string> chroms_seen;
diff --git a/src/utils/covered.cpp b/src/utils/covered.cpp
index b96aca56..26efbc75 100644
--- a/src/utils/covered.cpp
+++ b/src/utils/covered.cpp
@@ -113,14 +113,15 @@ main_covered(int argc, const char **argv) {
     bgzf_file in(filename, "r");
     if (!in) throw runtime_error("could not open file: " + filename);
 
-    const auto outfile_mode = in.compression() ? "w" : "wu";
+    const auto outfile_mode = in.is_compressed() ? "w" : "wu";
 
     bgzf_file out(outfile, outfile_mode);
     if (!out) throw runtime_error("error opening output file: " + outfile);
 
     if (n_threads > 1) {
       // ADS: something breaks when we use the thread for the input
-      tpool.set_io(in);
+      if (in.is_bgzf())
+        tpool.set_io(in);
       tpool.set_io(out);
     }
 
diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp
index 60b9a62f..0d30906e 100644
--- a/src/utils/recovered.cpp
+++ b/src/utils/recovered.cpp
@@ -57,7 +57,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
 
   bamxx::bam_tpool tp(n_threads);
   // set the threads for the input file decompression
-  if (n_threads > 1)
+  if (n_threads > 1 && in.is_bgzf())
     tp.set_io(in);
 
   unordered_set<int32_t> chroms_seen;
diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp
index 0c3844ee..a69c0a7d 100644
--- a/src/utils/symmetric-cpgs.cpp
+++ b/src/utils/symmetric-cpgs.cpp
@@ -174,7 +174,7 @@ main_symmetric_cpgs(int argc, const char **argv) {
     if (!out) throw runtime_error("error opening output file: " + outfile);
 
     if (n_threads > 1) {
-      tp.set_io(in);
+      if (in.is_bgzf()) tp.set_io(in);
       tp.set_io(out);
     }
 
diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index d78cf4e7..1229c279 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -85,7 +85,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
 
   bamxx::bam_tpool tp(n_threads);
   // set the threads for the input file decompression
-  if (n_threads > 1)
+  if (n_threads > 1 && in.is_bgzf())
     tp.set_io(in);
 
   unordered_set<int32_t> chroms_seen;
diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index da14d495..1585ef0c 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -19,7 +19,6 @@
 #include <bamxx.hpp>
 
 #include <iostream>
-#include <fstream>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -78,42 +77,37 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
 
 
 static void
-get_chrom_sizes(//const uint32_t n_threads,
+get_chrom_sizes(const uint32_t n_threads,
                 const string &filename,
                 vector<string> &chrom_names, vector<uint64_t> &chrom_sizes) {
 
-  // bamxx::bam_tpool tpool(n_threads);
+  bamxx::bam_tpool tpool(n_threads);
 
-  std::ifstream in(filename);
-  // bgzf_file in(filename, "r");
+  bgzf_file in(filename, "r");
   if (!in) throw runtime_error("could not open file: " + filename);
-  // if (n_threads > 1)
-  //   tpool.set_io(in);
+  if (n_threads > 1 && in.is_bgzf())
+    tpool.set_io(in);
 
   chrom_names.clear();
   chrom_sizes.clear();
 
   // use the kstring_t type to more directly use the BGZF file
-  string line;
-  // kstring_t line{0, 0, nullptr};
-  // const int ret = ks_resize(&line, 1024);
-  // if (ret) throw runtime_error("failed to acquire buffer");
+  kstring_t line{0, 0, nullptr};
+  const int ret = ks_resize(&line, 1024);
+  if (ret) throw runtime_error("failed to acquire buffer");
 
   uint64_t chrom_size = 0;
   while (getline(in, line)) {
-    // if (line.s[0] == '>') {
-    if (line[0] == '>') {
+    if (line.s[0] == '>') {
       if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
-      // chrom_names.emplace_back(line.s + 1);
-      chrom_names.emplace_back(line.substr(1));
+      chrom_names.emplace_back(line.s + 1);
       chrom_size = 0;
     }
-    // else chrom_size += line.l;
-    else chrom_size += size(line);
+    else chrom_size += line.l;
   }
   if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
 
-  // ks_free(&line);
+  ks_free(&line);
 
   assert(size(chrom_names) == size(chrom_sizes));
 }
@@ -173,22 +167,22 @@ main_xcounts(int argc, const char **argv) {
     vector<string> chrom_names;
     vector<uint64_t> chrom_sizes;
     if (!genome_file.empty())
-      get_chrom_sizes(//n_threads,
-                      genome_file, chrom_names, chrom_sizes);
+      get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes);
 
     bamxx::bam_tpool tpool(n_threads);
 
     bgzf_file in(filename, "r");
     if (!in) throw runtime_error("could not open file: " + filename);
 
-    const auto outfile_mode = in.compression() ? "w" : "wu";
+    const auto outfile_mode = in.is_compressed() ? "w" : "wu";
 
     bgzf_file out(outfile, outfile_mode);
     if (!out) throw runtime_error("error opening output file: " + outfile);
 
     if (n_threads > 1) {
       // ADS: something breaks when we use the thread for the input
-      tpool.set_io(in);
+      if (in.is_bgzf())
+        tpool.set_io(in);
       tpool.set_io(out);
     }
 
@@ -208,7 +202,7 @@ main_xcounts(int argc, const char **argv) {
     int64_t sz = 0;
 
     MSite site;
-    while (status_ok && getline(in, line)) {
+    while (status_ok && (status_ok = bool(getline(in, line)))) {
       if (line.s[0] == '#') {
         if (!genome_file.empty()) continue;
         const auto header_line = string(line.s) + "\n";

From f658558a6b3d256dd64e22738ef7c9907262d495 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 17:28:41 -0800
Subject: [PATCH 04/23] bamxx submodule update

---
 src/bamxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bamxx b/src/bamxx
index 09cfe360..7c2228c2 160000
--- a/src/bamxx
+++ b/src/bamxx
@@ -1 +1 @@
-Subproject commit 09cfe360be875271014b2824fecd8b732e640630
+Subproject commit 7c2228c2f0233e9df1f1534f374e08c0a196ae55

From 5b3568a6d85d2fe9dec0a66f57d42b216cf900e3 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 17:36:39 -0800
Subject: [PATCH 05/23] xcounts: fixing a bug from previous commit

---
 src/utils/xcounts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index 1585ef0c..582fc7f6 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -202,7 +202,7 @@ main_xcounts(int argc, const char **argv) {
     int64_t sz = 0;
 
     MSite site;
-    while (status_ok && (status_ok = bool(getline(in, line)))) {
+    while (status_ok && getline(in, line)) {
       if (line.s[0] == '#') {
         if (!genome_file.empty()) continue;
         const auto header_line = string(line.s) + "\n";

From f50caba10e512373a5cab2620ed6362226f69591 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 17:56:12 -0800
Subject: [PATCH 06/23] unxcounts: including logic to verify header against
 chromosomes sequences currently just using length

---
 src/utils/unxcounts.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 1229c279..c08e5908 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -271,6 +271,24 @@ get_chrom_idx(const unordered_map<string, int32_t> &name_to_idx,
   return chrom_idx->second;
 }
 
+
+static bool
+verify_chrom(const string &header_line,
+             const unordered_map<string, int32_t> &name_to_idx,
+             vector<uint64_t> &chrom_sizes) {
+  std::istringstream iss(header_line.substr(1));
+  string name;
+  uint64_t chrom_size = 0;
+  if (!(iss >> name >> chrom_size))
+    return false;
+
+  const auto idx = name_to_idx.find(name);
+  if (idx == cend(name_to_idx)) return false;
+
+  return chrom_size == chrom_sizes[idx->second];
+}
+
+
 static void
 process_sites(const bool verbose, const bool add_missing_chroms,
               const bool compress_output, const size_t n_threads,
@@ -309,7 +327,7 @@ process_sites(const bool verbose, const bool add_missing_chroms,
 
   // set the threads for the input file decompression
   if (n_threads > 1) {
-    tp.set_io(in);
+    if (in.is_bgzf()) tp.set_io(in);
     tp.set_io(out);
   }
 
@@ -329,6 +347,9 @@ process_sites(const bool verbose, const bool add_missing_chroms,
 
   while (getline(in, line)) {
     if (line.s[0] == '#') {
+      string header_line{line.s};
+      if (!verify_chrom(header_line, name_to_idx, chrom_sizes))
+        throw runtime_error("failed to convert");
       line.s[line.l++] = '\n';
       if (bgzf_write(out.f, line.s, line.l) != static_cast<int64_t>(line.l))
         throw runtime_error("failed to convert");

From 16a3fdc07e73113831cce77f95baac4424aa8048 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 19:37:25 -0800
Subject: [PATCH 07/23] unxcounts: adding logic to only generate the counts for
 covered sites

---
 src/utils/unxcounts.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index c08e5908..2db7ab23 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -291,6 +291,7 @@ verify_chrom(const string &header_line,
 
 static void
 process_sites(const bool verbose, const bool add_missing_chroms,
+              const bool require_covered,
               const bool compress_output, const size_t n_threads,
               const string &infile, const string &outfile,
               const string &chroms_file) {
@@ -390,14 +391,15 @@ process_sites(const bool verbose, const bool add_missing_chroms,
       res = from_chars(res.ptr + 1, end_line, n_unmeth);
 
       const auto curr_pos = pos + pos_step;
-      if (pos + 1 < curr_pos)
+      if (require_covered && pos + 1 < curr_pos)
         write_missing_sites(chrom_name, *chrom_itr, pos + 1, curr_pos, buf, out);
 
       write_site(chrom_name, *chrom_itr, curr_pos, n_meth, n_unmeth, buf, out);
       pos = curr_pos;
     }
   }
-  write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out);
+  if (require_covered)
+    write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out);
 
   if (add_missing_chroms) {
     const int32_t chrom_idx = size(chroms);
@@ -418,6 +420,7 @@ main_unxcounts(int argc, const char **argv) {
 
     bool verbose = false;
     bool add_missing_chroms = false;
+    bool require_covered = false;
     bool compress_output = false;
     size_t n_threads = 1;
 
@@ -431,6 +434,8 @@ main_unxcounts(int argc, const char **argv) {
                            "<xcounts-file>");
     opt_parse.add_opt("output", 'o', "output file (required)", true, outfile);
     opt_parse.add_opt("missing", 'm', "add missing chroms", false, add_missing_chroms);
+    opt_parse.add_opt("reads", 'r', "report only sites with reads", false,
+                      require_covered);
     opt_parse.add_opt("threads", 't', "number of threads", false, n_threads);
     opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)",
                       true , chroms_file);
@@ -455,11 +460,20 @@ main_unxcounts(int argc, const char **argv) {
       cerr << opt_parse.help_message() << endl;
       return EXIT_SUCCESS;
     }
+    if (require_covered && add_missing_chroms) {
+      cerr << "options mutually exclusive: reads and missing" << endl;
+      return EXIT_FAILURE;
+    }
     const string filename(leftover_args.front());
     /****************** END COMMAND LINE OPTIONS *****************/
 
-    process_sites(verbose, add_missing_chroms, compress_output, n_threads,
-                  filename, outfile, chroms_file);
+    if (require_covered && add_missing_chroms) {
+      cerr << "options mutually exclusive: reads and missing" << endl;
+      return EXIT_FAILURE;
+    }
+
+    process_sites(verbose, add_missing_chroms, require_covered, compress_output,
+                  n_threads, filename, outfile, chroms_file);
   }
   catch (const std::runtime_error &e) {
     cerr << e.what() << endl;

From 2583dc8b11d4fdc87edc380962691e5dee96d4e1 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 19:39:28 -0800
Subject: [PATCH 08/23] xcounts: ignoring the mutation mark on sites

---
 src/utils/xcounts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index 582fc7f6..98432c85 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -221,7 +221,7 @@ main_xcounts(int argc, const char **argv) {
         sz = size(site.chrom);
         status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz;
       }
-      if (site.n_reads > 0 || site.is_mutated()) {
+      if (site.n_reads > 0) {
         sz = fill_output_buffer(offset, site, buf);
         status_ok = bgzf_write(out.f, buf.data(), sz) == sz;
         offset = site.pos;

From 84846cee8c3d4d7d96d64216b27e37ba9fb45bbf Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 19:42:47 -0800
Subject: [PATCH 09/23] unxcounts: better error message on inconsistent header

---
 src/utils/unxcounts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 2db7ab23..1abf49f7 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -350,7 +350,7 @@ process_sites(const bool verbose, const bool add_missing_chroms,
     if (line.s[0] == '#') {
       string header_line{line.s};
       if (!verify_chrom(header_line, name_to_idx, chrom_sizes))
-        throw runtime_error("failed to convert");
+        throw runtime_error("failed to verify header for: " + header_line);
       line.s[line.l++] = '\n';
       if (bgzf_write(out.f, line.s, line.l) != static_cast<int64_t>(line.l))
         throw runtime_error("failed to convert");

From 3edadb1d316b9d369307968501c83b9e81176a5b Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 19:44:39 -0800
Subject: [PATCH 10/23] unxcounts: trying to find the best header termination

---
 src/utils/unxcounts.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 1abf49f7..160e7a4f 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -349,6 +349,7 @@ process_sites(const bool verbose, const bool add_missing_chroms,
   while (getline(in, line)) {
     if (line.s[0] == '#') {
       string header_line{line.s};
+      if (header_line == "# end_header") continue;
       if (!verify_chrom(header_line, name_to_idx, chrom_sizes))
         throw runtime_error("failed to verify header for: " + header_line);
       line.s[line.l++] = '\n';

From fd39ce8007f45663c6f5922799a35dbab17e5768 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 20:00:44 -0800
Subject: [PATCH 11/23] unxcounts: better handling of header

---
 src/utils/unxcounts.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 160e7a4f..3903701c 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -50,6 +50,7 @@ using std::cbegin;
 using std::cend;
 using std::copy;
 using std::copy_n;
+using std::to_string;
 
 using bamxx::bgzf_file;
 
@@ -349,12 +350,12 @@ process_sites(const bool verbose, const bool add_missing_chroms,
   while (getline(in, line)) {
     if (line.s[0] == '#') {
       string header_line{line.s};
-      if (header_line == "# end_header") continue;
       if (!verify_chrom(header_line, name_to_idx, chrom_sizes))
         throw runtime_error("failed to verify header for: " + header_line);
       line.s[line.l++] = '\n';
-      if (bgzf_write(out.f, line.s, line.l) != static_cast<int64_t>(line.l))
-        throw runtime_error("failed to convert");
+      const int64_t ret = bgzf_write(out.f, line.s, line.l);
+      if (ret != static_cast<int64_t>(line.l))
+        throw runtime_error("failed to convert: " + to_string(ret));
       continue;
     }
 

From 6db3e530ddd3aa914f3996e1b39cef20e8b49145 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 20:01:15 -0800
Subject: [PATCH 12/23] xcounts and counts: header no longer ends with
 end_header and also there is no space after the # in header lines

---
 src/analysis/methcounts.cpp | 3 +--
 src/utils/xcounts.cpp       | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp
index 76aeaefd..ea935799 100644
--- a/src/analysis/methcounts.cpp
+++ b/src/analysis/methcounts.cpp
@@ -388,10 +388,9 @@ write_header(const bam_header &hdr, bgzf_file &out) {
     const size_t tid_size = sam_hdr_tid2len(hdr, i);
     const string tid_name = sam_hdr_tid2name(hdr, i);
     std::ostringstream oss;
-    oss << "# " << tid_name << ' ' << tid_size << '\n';
+    oss << "#" << tid_name << ' ' << tid_size << '\n';
     out.write(oss.str());
   }
-  out.write("# end_header\n");
 }
 
 template<const bool require_covered = false> static void
diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index 98432c85..3a2dc912 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -117,10 +117,9 @@ write_header(const vector<string> &chrom_names,
              const vector<uint64_t> &chrom_sizes, bgzf_file &out) {
   for (auto i = 0u; i < size(chrom_sizes); ++i) {
     const string tmp =
-      "# " + chrom_names[i] + "\t" + to_string(chrom_sizes[i]) + "\n";
+      "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
     out.write(tmp.c_str());
   }
-  out.write("# end_header\n");
 }
 
 int

From 76ecc8cb204a0c27897bd38ada191c3c26886164 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 20:07:11 -0800
Subject: [PATCH 13/23] unxcounts: fixing backwards condition for requiring
 coverage

---
 src/utils/unxcounts.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 3903701c..c6ddd4c6 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -393,14 +393,14 @@ process_sites(const bool verbose, const bool add_missing_chroms,
       res = from_chars(res.ptr + 1, end_line, n_unmeth);
 
       const auto curr_pos = pos + pos_step;
-      if (require_covered && pos + 1 < curr_pos)
+      if (!require_covered && pos + 1 < curr_pos)
         write_missing_sites(chrom_name, *chrom_itr, pos + 1, curr_pos, buf, out);
 
       write_site(chrom_name, *chrom_itr, curr_pos, n_meth, n_unmeth, buf, out);
       pos = curr_pos;
     }
   }
-  if (require_covered)
+  if (!require_covered)
     write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out);
 
   if (add_missing_chroms) {

From 944e6606da5cff9e802ef9cec14d46fc86b66822 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Fri, 8 Dec 2023 20:18:16 -0800
Subject: [PATCH 14/23] unxcounts: adding a missed spot to check for required
 coverage

---
 src/utils/unxcounts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index c6ddd4c6..7276a964 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -361,7 +361,7 @@ process_sites(const bool verbose, const bool add_missing_chroms,
 
     if (!std::isdigit(line.s[0])) {
 
-      if (pos != num_lim<uint64_t>::max())
+      if (!require_covered && pos != num_lim<uint64_t>::max())
         write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out);
 
       chrom_name = string{line.s};

From 0e329e90da30697db72c56039132c704edd581fd Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:05:00 -0800
Subject: [PATCH 15/23] counts_header.cpp counts_header.hpp Makefile.am: adding
 counts_header stuff

---
 Makefile.am                  |   2 +
 src/common/counts_header.cpp | 163 +++++++++++++++++++++++++++++++++++
 src/common/counts_header.hpp |  65 ++++++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 src/common/counts_header.cpp
 create mode 100644 src/common/counts_header.hpp

diff --git a/Makefile.am b/Makefile.am
index 368a8d54..e30d299d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -145,6 +145,7 @@ noinst_LIBRARIES = libdnmtools.a
 libdnmtools_a_SOURCES = \
         src/common/dnmtools_gaussinv.cpp \
         src/common/bam_record_utils.cpp \
+        src/common/counts_header.cpp \
         src/common/BetaBin.cpp \
         src/common/EmissionDistribution.cpp \
         src/common/Epiread.cpp \
@@ -163,6 +164,7 @@ libdnmtools_a_SOURCES += \
 	src/bamxx/bamxx.hpp \
         src/common/dnmtools_gaussinv.hpp \
 	src/common/bam_record_utils.hpp \
+	src/common/counts_header.hpp \
         src/common/BetaBin.hpp \
         src/common/EmissionDistribution.hpp \
         src/common/Epiread.hpp \
diff --git a/src/common/counts_header.cpp b/src/common/counts_header.cpp
new file mode 100644
index 00000000..e6a814b0
--- /dev/null
+++ b/src/common/counts_header.cpp
@@ -0,0 +1,163 @@
+/* xcounts_utils: code for doing things with xcounts format and some
+ * for counts format that is common to several tools.
+ *
+ * Copyright (C) 2023 Andrew D. Smith
+ *
+ * Authors: Andrew D. Smith
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include "counts_header.hpp"
+
+#include <string>
+#include <vector>
+#include <cassert>
+#include <cstdint>
+#include <sstream>
+
+#include "bam_record_utils.hpp"
+
+// generated by autotools
+#include <config.h>
+
+#include "bamxx.hpp"
+
+using std::vector;
+using std::string;
+using std::to_string;
+
+using bamxx::bgzf_file;
+
+void
+write_counts_header_from_chom_sizes(const vector<string> &chrom_names,
+                                    const vector<uint64_t> &chrom_sizes,
+                                    bgzf_file &out) {
+  const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
+  out.write(version.c_str());
+  for (auto i = 0u; i < size(chrom_sizes); ++i) {
+    const string tmp =
+      "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
+    out.write(tmp.c_str());
+  }
+  out.write("#\n");
+}
+
+
+inline bgzf_file &
+getline(bgzf_file &file, kstring_t &line) {
+  if (file.f == nullptr) return file;
+  const int x = bgzf_getline(file.f, '\n', &line);
+  if (x == -1) {
+    file.destroy();
+    free(line.s);
+    line = {0, 0, nullptr};
+  }
+  if (x < -1) {
+    // ADS: this is an error condition and should be handled
+    // differently from the EOF above.
+    file.destroy();
+    free(line.s);
+    line = {0, 0, nullptr};
+  }
+  return file;
+}
+
+
+bamxx::bgzf_file &
+skip_counts_header(bamxx::bgzf_file &in) {
+
+  // use the kstring_t type to more directly use the BGZF file
+  kstring_t line{0, 0, nullptr};
+  const int ret = ks_resize(&line, 1024);
+  if (ret) return in;
+
+  while (getline(in, line) && line.s[0] == '#') {
+    if (line.s[0] == '#' && line.l == 1)
+      return in;
+  }
+  // otherwise we have missed the final line of the header
+  assert(line.s[0] != '#');
+  return in;
+}
+
+
+int
+get_chrom_sizes_for_counts_header(const uint32_t n_threads,
+                                  const string &filename,
+                                  vector<string> &chrom_names,
+                                  vector<uint64_t> &chrom_sizes) {
+
+  bamxx::bam_tpool tpool(n_threads);
+
+  bgzf_file in(filename, "r");
+  if (!in) return -1;
+  if (n_threads > 1 && in.is_bgzf())
+    tpool.set_io(in);
+
+  chrom_names.clear();
+  chrom_sizes.clear();
+
+  // use the kstring_t type to more directly use the BGZF file
+  kstring_t line{0, 0, nullptr};
+  const int ret = ks_resize(&line, 1024);
+  if (ret) return -1;
+
+  uint64_t chrom_size = 0;
+  while (getline(in, line)) {
+    if (line.s[0] == '>') {
+      if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
+      chrom_names.emplace_back(line.s + 1);
+      chrom_size = 0;
+    }
+    else chrom_size += line.l;
+  }
+  if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
+
+  ks_free(&line);
+
+  assert(size(chrom_names) == size(chrom_sizes));
+
+  return 0;
+}
+
+
+void
+write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
+                                    bgzf_file &out) {
+  const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
+  out.write(version.c_str());
+  for (auto i = 0; i < hdr.h->n_targets; ++i) {
+    const size_t tid_size = sam_hdr_tid2len(hdr, i);
+    const string tid_name = sam_hdr_tid2name(hdr, i);
+    std::ostringstream oss;
+    oss << "#" << tid_name << ' ' << tid_size << '\n';
+    out.write(oss.str());
+  }
+  out.write("#\n");
+}
+
+
+bool
+write_counts_header_line(string line, bgzf_file &out) {
+  line += '\n';
+  const int64_t ret = bgzf_write(out.f, line.data(), size(line));
+  return ret == static_cast<int64_t>(size(line));
+}
+
+bool
+has_counts_header(const string &filename) {
+  bgzf_file in(filename, "r");
+  if (!in) return false;
+  string line;
+  if (!getline(in, line)) return false;
+  return line[0] == '#';
+}
diff --git a/src/common/counts_header.hpp b/src/common/counts_header.hpp
new file mode 100644
index 00000000..171eff28
--- /dev/null
+++ b/src/common/counts_header.hpp
@@ -0,0 +1,65 @@
+/* xcounts_utils: code for doing things with xcounts format and some
+ * for counts format that is common to several tools.
+ *
+ * Copyright (C) 2023 Andrew D. Smith
+ *
+ * Authors: Andrew D. Smith
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef COUNTS_HEADER_HPP
+#define COUNTS_HEADER_HPP
+
+#include <string>
+#include <vector>
+#include <cstdint>
+
+#include "bamxx.hpp"
+
+void
+write_counts_header_from_chom_sizes(const std::vector<std::string> &chrom_names,
+                                    const std::vector<uint64_t> &chrom_sizes,
+                                    bamxx::bgzf_file &out);
+
+// returns -1 on failure, 0 on success
+int
+get_chrom_sizes_for_counts_header(const uint32_t n_threads,
+                                  const std::string &filename,
+                                  std::vector<std::string> &chrom_names,
+                                  std::vector<uint64_t> &chrom_sizes);
+
+void
+write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
+                                    bamxx::bgzf_file &out);
+
+bool
+write_counts_header_line(std::string line, bamxx::bgzf_file &out);
+
+bamxx::bgzf_file &
+skip_counts_header(bamxx::bgzf_file &in);
+
+bool
+has_counts_header(const std::string &filename);
+
+inline bool
+is_counts_header_version_line(const std::string &line) {
+  const auto version_line = "#DNMTOOLS";
+  return line.compare(0, 9, version_line) == 0;
+}
+
+template<typename T>
+inline bool
+is_counts_header_line(T &line) {
+  return line[0] == '#';
+}
+
+#endif

From 2bfcef004f43426d6ddd20ad21f1de46167567e0 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:05:50 -0800
Subject: [PATCH 16/23] MSite.cpp: adding a loop to pass the header when trying
 to determine if a file is a counts file

---
 src/common/MSite.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp
index 36e47fcf..27bd1a22 100644
--- a/src/common/MSite.cpp
+++ b/src/common/MSite.cpp
@@ -24,6 +24,7 @@
 #include <charconv>
 
 #include "smithlab_utils.hpp"
+#include "counts_header.hpp"
 
 using std::string;
 using std::runtime_error;
@@ -361,7 +362,8 @@ is_msite_file(const string &file) {
   if (!in) throw runtime_error("cannot open file: " + file);
 
   string line;
-  if (!getline(in, line)) return false;
+  while (getline(in, line) && is_counts_header_line(line))
+    ;
 
   return is_msite_line(line);
 }

From dd6cb9d6164c291df9a66720c83c896322671786 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:06:19 -0800
Subject: [PATCH 17/23] hmr.cpp and pmd.cpp: adding code to skip the header
 when reading sites

---
 src/analysis/hmr.cpp |  4 ++++
 src/analysis/pmd.cpp | 27 +++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp
index 4d06ffb4..882caed9 100644
--- a/src/analysis/hmr.cpp
+++ b/src/analysis/hmr.cpp
@@ -33,6 +33,7 @@
 
 #include "TwoStateHMM.hpp"
 #include "MSite.hpp"
+#include "counts_header.hpp"
 
 using std::string;
 using std::vector;
@@ -337,6 +338,9 @@ load_cpgs(const string &cpgs_file, vector<MSite> &cpgs,
   bgzf_file in(cpgs_file, "r");
   if (!in) throw runtime_error("failed opening file: " + cpgs_file);
 
+  if (has_counts_header(cpgs_file))
+    skip_counts_header(in);
+
   MSite prev_site, the_site;
   while (read_site(in, the_site)) {
     if (!the_site.is_cpg() || distance(prev_site, the_site) < 2)
diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp
index 92bee14b..07c3e3b1 100644
--- a/src/analysis/pmd.cpp
+++ b/src/analysis/pmd.cpp
@@ -33,6 +33,7 @@
 #include "GenomicRegion.hpp"
 #include "OptionParser.hpp"
 #include "bsutils.hpp"
+#include "counts_header.hpp"
 
 #include "TwoStateHMM_PMD.hpp"
 #include "MSite.hpp"
@@ -240,8 +241,11 @@ get_optimized_boundary_likelihoods(const vector<string> &cpgs_file,
   static const double array_coverage_constant = 10;
 
   vector<bgzf_file*> in(cpgs_file.size());
-  for (size_t i = 0; i < cpgs_file.size(); ++i)
+  for (size_t i = 0; i < cpgs_file.size(); ++i) {
     in[i] = new bgzf_file(cpgs_file[i], "r");
+    if (has_counts_header(cpgs_file[i]))
+      skip_counts_header(*in[i]);
+  }
 
   std::map<size_t, pair<size_t, size_t> > pos_meth_tot;
   size_t n_meth = 0ul, n_reads = 0ul;
@@ -344,8 +348,11 @@ find_exact_boundaries(const vector<string> &cpgs_file,
   static const double array_coverage_constant = 10;
 
   vector<bgzf_file*> in(cpgs_file.size());
-  for (size_t i = 0; i < cpgs_file.size(); ++i)
+  for (size_t i = 0; i < cpgs_file.size(); ++i) {
     in[i] = new bgzf_file(cpgs_file[i], "r");
+    if (has_counts_header(cpgs_file[i]))
+      skip_counts_header(*in[i]);
+  }
 
   std::map<size_t, pair<size_t, size_t> > pos_meth_tot;
   size_t n_meth = 0ul, n_reads = 0ul;
@@ -653,6 +660,7 @@ shuffle_bins(const size_t rng_seed,
   sort(begin(domain_scores), end(domain_scores));
 }
 
+
 static void
 assign_p_values(const vector<double> &random_scores,
                 const vector<double> &observed_scores,
@@ -665,6 +673,7 @@ assign_p_values(const vector<double> &random_scores,
   }
 }
 
+
 static void
 read_params_file(const bool VERBOSE,
                  const string &params_file,
@@ -759,6 +768,9 @@ check_if_array_data(const string &infile) {
   bgzf_file in(infile, "r");
   if (!in) throw std::runtime_error("bad file: " + infile);
 
+  if (has_counts_header(infile))
+    skip_counts_header(in);
+
   std::string line;
   getline(in, line);
   std::istringstream iss(line);
@@ -779,6 +791,10 @@ load_array_data(const size_t bin_size,
 
   bgzf_file in(cpgs_file, "r");
   if (!in) throw std::runtime_error("bad sites file: " + cpgs_file);
+
+  if (has_counts_header(cpgs_file))
+    skip_counts_header(in);
+
   string curr_chrom;
   size_t prev_pos = 0ul, curr_pos = 0ul;
   double array_meth_bin = 0.0;
@@ -872,6 +888,9 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file,
   bgzf_file in(cpgs_file, "r");
   if (!in) throw runtime_error("bad sites file: " + cpgs_file);
 
+  if (has_counts_header(cpgs_file))
+    skip_counts_header(in);
+
   // keep track of the chroms we've seen
   string curr_chrom;
   std::unordered_set<string> chroms_seen;
@@ -947,6 +966,9 @@ load_read_counts(const string &cpgs_file, const size_t bin_size,
   bgzf_file in(cpgs_file, "r");
   if (!in) throw runtime_error("bad methcounts file: " + cpgs_file);
 
+  if (has_counts_header(cpgs_file))
+    skip_counts_header(in);
+
   // keep track of where we are and what we've seen
   size_t bin_start = 0ul;
   string curr_chrom;
@@ -970,6 +992,7 @@ load_read_counts(const string &cpgs_file, const size_t bin_size,
   }
 }
 
+
 static double
 good_bins_frac(const vector<size_t> &cumulative, const size_t min_bin_size,
                const size_t bin_size, const size_t min_cov_to_pass) {

From 699a0f282fb0b86123f707254942a215d4d959d0 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:06:53 -0800
Subject: [PATCH 18/23] methcounts.cpp: refactoring to move the header code
 into counts_header.chpp

---
 src/analysis/methcounts.cpp | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp
index ea935799..4eab69e6 100644
--- a/src/analysis/methcounts.cpp
+++ b/src/analysis/methcounts.cpp
@@ -36,6 +36,7 @@
 #include "bsutils.hpp"
 #include "dnmt_error.hpp"
 #include "bam_record_utils.hpp"
+#include "counts_header.hpp"
 
 /* HTSlib */
 #include <htslib/sam.h>
@@ -382,16 +383,6 @@ consistent_targets(const bam_header &hdr,
   return true;
 }
 
-static void
-write_header(const bam_header &hdr, bgzf_file &out) {
-  for (auto i = 0; i < hdr.h->n_targets; ++i) {
-    const size_t tid_size = sam_hdr_tid2len(hdr, i);
-    const string tid_name = sam_hdr_tid2name(hdr, i);
-    std::ostringstream oss;
-    oss << "#" << tid_name << ' ' << tid_size << '\n';
-    out.write(oss.str());
-  }
-}
 
 template<const bool require_covered = false> static void
 process_reads(const bool VERBOSE, const bool show_progress,
@@ -442,7 +433,7 @@ process_reads(const bool VERBOSE, const bool show_progress,
   }
 
   if (include_header)
-    write_header(hdr, out);
+    write_counts_header_from_bam_header(hdr, out);
 
   // now iterate over the reads, switching chromosomes and writing
   // output as needed

From edeb17daf3eaed1e2ac4adae51cd272f37d76e9b Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:07:19 -0800
Subject: [PATCH 19/23] recovered.cpp: refactoring to move the header stuff
 into counts_header

---
 src/utils/recovered.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp
index 0d30906e..c268c238 100644
--- a/src/utils/recovered.cpp
+++ b/src/utils/recovered.cpp
@@ -30,6 +30,7 @@
 #include "smithlab_os.hpp"
 #include "bsutils.hpp"
 #include "dnmt_error.hpp"
+#include "counts_header.hpp"
 
 #include "MSite.hpp"
 
@@ -67,7 +68,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
   int32_t prev_idx = -1;
 
   while (getline(in, line)) {
-    if (line[0] == '#') continue;
+    if (is_counts_header_line(line)) continue;
     line.resize(line.find_first_of(" \t"));
     if (line != prev_chrom) {
       if (verbose) cerr << "verifying: " << line << endl;
@@ -277,7 +278,10 @@ process_sites(const bool verbose, const bool add_missing_chroms,
   string line;
 
   while (getline(in, line)) {
-    if (line[0] == '#') continue;
+    if (is_counts_header_line(line)) {
+      write_counts_header_line(line, out);
+      continue;
+    }
     site.initialize(line.data(), line.data() + size(line));
     if (site.chrom != chrom_name) {
 

From c13c28d98d3a186128c9af84d6c229b78598c368 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:07:36 -0800
Subject: [PATCH 20/23] smmetric-cpgs.cpp: refactoring to move the header stuff
 into counts_header

---
 src/utils/symmetric-cpgs.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp
index a69c0a7d..3c4190cb 100644
--- a/src/utils/symmetric-cpgs.cpp
+++ b/src/utils/symmetric-cpgs.cpp
@@ -33,6 +33,8 @@
 #include "smithlab_os.hpp"
 #include "smithlab_utils.hpp"
 
+#include "counts_header.hpp"
+
 using std::cerr;
 using std::cout;
 using std::endl;
@@ -61,9 +63,8 @@ get_first_site(T &in, T &out) {
   string line;
   bool within_header = true;
   while (within_header && getline(in, line)) {
-    if (line[0] == '#') {
-      line += '\n';
-      out.write(line);
+    if (is_counts_header_line(line)) {
+      write_counts_header_line(line, out);
     }
     else {
       prev_site.initialize(line.data(), line.data() + size(line));

From 1e1707c53248d3cd5c234b95468163a8f4f9c592 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:10:19 -0800
Subject: [PATCH 21/23] xcounts.cpp: refactoring to move the header stuff into
 counts_header

---
 src/utils/xcounts.cpp | 84 +++++++++++++------------------------------
 1 file changed, 24 insertions(+), 60 deletions(-)

diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp
index 3a2dc912..ab271e6b 100644
--- a/src/utils/xcounts.cpp
+++ b/src/utils/xcounts.cpp
@@ -29,7 +29,10 @@
 #include "OptionParser.hpp"
 #include "smithlab_os.hpp"
 #include "smithlab_utils.hpp"
+
 #include "MSite.hpp"
+#include "counts_header.hpp"
+#include "dnmt_error.hpp"
 
 using std::cerr;
 using std::cout;
@@ -76,55 +79,10 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
 }
 
 
-static void
-get_chrom_sizes(const uint32_t n_threads,
-                const string &filename,
-                vector<string> &chrom_names, vector<uint64_t> &chrom_sizes) {
-
-  bamxx::bam_tpool tpool(n_threads);
-
-  bgzf_file in(filename, "r");
-  if (!in) throw runtime_error("could not open file: " + filename);
-  if (n_threads > 1 && in.is_bgzf())
-    tpool.set_io(in);
-
-  chrom_names.clear();
-  chrom_sizes.clear();
-
-  // use the kstring_t type to more directly use the BGZF file
-  kstring_t line{0, 0, nullptr};
-  const int ret = ks_resize(&line, 1024);
-  if (ret) throw runtime_error("failed to acquire buffer");
-
-  uint64_t chrom_size = 0;
-  while (getline(in, line)) {
-    if (line.s[0] == '>') {
-      if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
-      chrom_names.emplace_back(line.s + 1);
-      chrom_size = 0;
-    }
-    else chrom_size += line.l;
-  }
-  if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
-
-  ks_free(&line);
-
-  assert(size(chrom_names) == size(chrom_sizes));
-}
-
-static void
-write_header(const vector<string> &chrom_names,
-             const vector<uint64_t> &chrom_sizes, bgzf_file &out) {
-  for (auto i = 0u; i < size(chrom_sizes); ++i) {
-    const string tmp =
-      "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
-    out.write(tmp.c_str());
-  }
-}
-
 int
 main_xcounts(int argc, const char **argv) {
   try {
+    bool require_coverage = false;
     size_t n_threads = 1;
     string genome_file;
 
@@ -139,6 +97,8 @@ main_xcounts(int argc, const char **argv) {
                       false, outfile);
     opt_parse.add_opt("chroms", 'c', "make header from this reference",
                       false, genome_file);
+    opt_parse.add_opt("reads", 'r', "ouput only sites with reads",
+                      false, require_coverage);
     opt_parse.add_opt("threads", 't', "threads for compression (use few)",
                       false, n_threads);
     std::vector<string> leftover_args;
@@ -165,18 +125,24 @@ main_xcounts(int argc, const char **argv) {
 
     vector<string> chrom_names;
     vector<uint64_t> chrom_sizes;
-    if (!genome_file.empty())
-      get_chrom_sizes(n_threads, genome_file, chrom_names, chrom_sizes);
+    if (!genome_file.empty()) {
+      const int ret =
+        get_chrom_sizes_for_counts_header(n_threads, genome_file,
+                                          chrom_names, chrom_sizes);
+      if (ret) throw dnmt_error("failed to get chrom sizes from: " +
+                                genome_file);
+    }
+
 
     bamxx::bam_tpool tpool(n_threads);
 
     bgzf_file in(filename, "r");
-    if (!in) throw runtime_error("could not open file: " + filename);
+    if (!in) throw dnmt_error("could not open file: " + filename);
 
     const auto outfile_mode = in.is_compressed() ? "w" : "wu";
 
     bgzf_file out(outfile, outfile_mode);
-    if (!out) throw runtime_error("error opening output file: " + outfile);
+    if (!out) throw dnmt_error("error opening output file: " + outfile);
 
     if (n_threads > 1) {
       // ADS: something breaks when we use the thread for the input
@@ -186,27 +152,25 @@ main_xcounts(int argc, const char **argv) {
     }
 
     if (!genome_file.empty())
-      write_header(chrom_names, chrom_sizes, out);
+      write_counts_header_from_chom_sizes(chrom_names, chrom_sizes, out);
 
     // use the kstring_t type to more directly use the BGZF file
     kstring_t line{0, 0, nullptr};
     const int ret = ks_resize(&line, 1024);
-    if (ret) throw runtime_error("failed to acquire buffer");
+    if (ret) throw dnmt_error("failed to acquire buffer");
 
     vector<char> buf(128);
 
     uint32_t offset = 0;
     string prev_chrom;
     bool status_ok = true;
-    int64_t sz = 0;
 
     MSite site;
     while (status_ok && getline(in, line)) {
-      if (line.s[0] == '#') {
+      if (is_counts_header_line(line.s)) {
         if (!genome_file.empty()) continue;
-        const auto header_line = string(line.s) + "\n";
-        sz = size(header_line);
-        status_ok = bgzf_write(out.f, header_line.data(), sz) == sz;
+        const string header_line{line.s};
+        write_counts_header_line(header_line, out);
         continue;
       }
       status_ok = site.initialize(line.s, line.s + line.l);
@@ -217,11 +181,11 @@ main_xcounts(int argc, const char **argv) {
         offset = 0;
 
         site.chrom += '\n';
-        sz = size(site.chrom);
+        const int64_t sz = size(site.chrom);
         status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz;
       }
       if (site.n_reads > 0) {
-        sz = fill_output_buffer(offset, site, buf);
+        const int64_t sz = fill_output_buffer(offset, site, buf);
         status_ok = bgzf_write(out.f, buf.data(), sz) == sz;
         offset = site.pos;
       }
@@ -234,7 +198,7 @@ main_xcounts(int argc, const char **argv) {
       return EXIT_FAILURE;
     }
   }
-  catch (const std::runtime_error &e) {
+  catch (const std::exception &e) {
     cerr << e.what() << endl;
     return EXIT_FAILURE;
   }

From 6acba7b2629822afdd714b94e3c2d24df9e4f668 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Sat, 9 Dec 2023 14:10:33 -0800
Subject: [PATCH 22/23] unxcounts.cpp: refactoring to move the header stuff
 into counts_header

---
 src/utils/unxcounts.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp
index 7276a964..156eb885 100644
--- a/src/utils/unxcounts.cpp
+++ b/src/utils/unxcounts.cpp
@@ -31,6 +31,7 @@
 #include "smithlab_os.hpp"
 #include "bsutils.hpp"
 #include "dnmt_error.hpp"
+#include "counts_header.hpp"
 
 #include "MSite.hpp"
 
@@ -97,7 +98,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
   if (ret) throw runtime_error("failed to acquire buffer");
 
   while (getline(in, line)) {
-    if (line.s[0] == '#') continue;
+    if (is_counts_header_line(line.s)) continue;
     if (std::isdigit(line.s[0])) continue;
 
     string chrom{line.s};
@@ -252,8 +253,10 @@ write_site(const string &name, const string &chrom,
   return bgzf_write(out.f, buf.data(), sz) == sz;
 }
 
+
 typedef vector<string>::const_iterator chrom_itr_t;
 
+
 static chrom_itr_t
 get_chrom(const unordered_map<string, chrom_itr_t> &chrom_lookup,
           const string &chrom_name) {
@@ -263,6 +266,7 @@ get_chrom(const unordered_map<string, chrom_itr_t> &chrom_lookup,
   return chrom_idx->second;
 }
 
+
 static int32_t
 get_chrom_idx(const unordered_map<string, int32_t> &name_to_idx,
               const string &chrom_name) {
@@ -277,6 +281,8 @@ static bool
 verify_chrom(const string &header_line,
              const unordered_map<string, int32_t> &name_to_idx,
              vector<uint64_t> &chrom_sizes) {
+  if (is_counts_header_version_line(header_line))
+    return true;
   std::istringstream iss(header_line.substr(1));
   string name;
   uint64_t chrom_size = 0;
@@ -348,14 +354,12 @@ process_sites(const bool verbose, const bool add_missing_chroms,
   chrom_itr_t chrom_itr;
 
   while (getline(in, line)) {
-    if (line.s[0] == '#') {
+    if (is_counts_header_line(line.s)) {
       string header_line{line.s};
-      if (!verify_chrom(header_line, name_to_idx, chrom_sizes))
+      if (size(header_line) > 1 && !verify_chrom(header_line, name_to_idx, chrom_sizes))
         throw runtime_error("failed to verify header for: " + header_line);
-      line.s[line.l++] = '\n';
-      const int64_t ret = bgzf_write(out.f, line.s, line.l);
-      if (ret != static_cast<int64_t>(line.l))
-        throw runtime_error("failed to convert: " + to_string(ret));
+      if (!write_counts_header_line(header_line, out))
+        throw runtime_error("failed to write header line: " + header_line);
       continue;
     }
 

From 32fd022c49214a7d84fcb008b9d0c09ffaf8107f Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Mon, 11 Dec 2023 18:05:50 -0800
Subject: [PATCH 23/23] uniq.cpp: adding a variable to control the max buffer
 size which matters for some strange data sets with hundreds of thousands of
 reads at the same mapping location

---
 src/utils/uniq.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp
index b5a7c77b..41a64735 100644
--- a/src/utils/uniq.cpp
+++ b/src/utils/uniq.cpp
@@ -197,9 +197,10 @@ process_buffer(const bool add_dup_count, rd_stats &rs_out, size_t &reads_duped,
 }
 
 static void
-uniq(const bool add_dup_count, const size_t n_threads,
-     const string &cmd, const string &infile, const string &statfile,
-     const string &histfile, const bool bam_format, const string &outfile) {
+uniq(const bool add_dup_count, const uint32_t max_buffer_size,
+     const size_t n_threads, const string &cmd, const string &infile,
+     const string &statfile, const string &histfile, const bool bam_format,
+     const string &outfile) {
   // values to tabulate stats; no real cost
   rd_stats rs_in, rs_out;
   size_t reads_duped = 0;
@@ -254,7 +255,10 @@ uniq(const bool add_dup_count, const size_t n_threads,
       if (!equivalent_chrom_and_start(buffer[0], aln))
         process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr,
                        out);
-      buffer.push_back(aln);
+      if (size(buffer) < max_buffer_size || add_dup_count)
+        buffer.push_back(aln);
+      else if (!add_dup_count)
+        std::swap(buffer[uniq_random::rand() % max_buffer_size], aln);
     }
     process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out);
   }
@@ -266,6 +270,7 @@ uniq(const bool add_dup_count, const size_t n_threads,
 int
 main_uniq(int argc, const char **argv) {
   try {
+    uint32_t max_buffer_size = std::numeric_limits<uint32_t>::max();
     bool VERBOSE = false;
 
     bool bam_format = false;
@@ -297,6 +302,8 @@ main_uniq(int argc, const char **argv) {
     opt_parse.add_opt("stdout", '\0', "write to standard output", false,
                       use_stdout);
     opt_parse.add_opt("seed", 's', "random seed", false, the_seed);
+    opt_parse.add_opt("max", 'm', "max duplicates to consider",
+                      false, max_buffer_size);
     opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
     opt_parse.set_show_defaults();
     vector<string> leftover_args;
@@ -344,7 +351,7 @@ main_uniq(int argc, const char **argv) {
            << "[command line: \"" << cmd.str() << "\"]" << endl
            << "[random number seed: " << the_seed << "]" << endl;
 
-    uniq(add_dup_count, n_threads, cmd.str(), infile, statfile,
+    uniq(add_dup_count, max_buffer_size, n_threads, cmd.str(), infile, statfile,
          histfile, bam_format, outfile);
   }
   catch (const runtime_error &e) {