From f5968b3157665341f3239063b0cac1f6da0bc219 Mon Sep 17 00:00:00 2001
From: Andrew D Smith <andrewds@usc.edu>
Date: Thu, 15 Aug 2024 21:29:52 -0700
Subject: [PATCH] src/utils/clean-hairpins.cpp: major overhaul to code
 organization and many behaviors

---
 src/utils/clean-hairpins.cpp | 473 +++++++++++++++++++----------------
 1 file changed, 257 insertions(+), 216 deletions(-)
diff --git a/src/utils/clean-hairpins.cpp b/src/utils/clean-hairpins.cpp
index a937bccc..b696c15f 100644
--- a/src/utils/clean-hairpins.cpp
+++ b/src/utils/clean-hairpins.cpp
@@ -16,148 +16,136 @@
  * General Public License for more details.
  */
 
-#include <string>
-#include <vector>
-#include <iostream>
+#include <bamxx.hpp>
+
+#include <algorithm>
+#include <array>
 #include <fstream>
+#include <iomanip>
+#include <iostream>
 #include <iterator>
-#include <algorithm>
+#include <limits>
 #include <stdexcept>
+#include <string>
+#include <vector>
 
 #include "OptionParser.hpp"
-#include "smithlab_utils.hpp"
 #include "smithlab_os.hpp"
+#include "smithlab_utils.hpp"
 
-using std::string;
-using std::vector;
-using std::cout;
+using std::array;
 using std::cerr;
+using std::cout;
 using std::endl;
+using std::floor;
+using std::ifstream;
+using std::istream;
 using std::min;
+using std::ofstream;
+using std::ostream;
+using std::ostringstream;
 using std::runtime_error;
+using std::string;
+using std::vector;
+
+using bamxx::bgzf_file;
 
 // store each read from one end
 struct FASTQRecord {
   string name;
   string seq;
-  string score;
-
-  string tostring() const {
-    std::ostringstream s;
-    s << '@'
-      << name << '\n'
-      << seq << '\n'
-      << '+' << '\n'
-      << score;
-    return s.str();
-  }
+  string qual;
 };
 
-// see if two reads from two ends match to each other (they should
-// have the same name)
-static bool
-mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends
-      const FASTQRecord &a, const FASTQRecord &b) {
-  assert(to_ignore_at_end < a.name.length());
-  return equal(begin(a.name), end(a.name) - to_ignore_at_end, begin(b.name));
-}
-
-static std::ostream&
-operator<<(std::ostream& s, const FASTQRecord &r) {
-  return s << r.tostring();
+bgzf_file &
+operator<<(bgzf_file &out, const FASTQRecord &r) {
+  // below, the other chars are @, + and four newlines
+  static const uint32_t other_chars = 6;
+  const size_t buf_size =
+    size(r.name) + size(r.seq) + size(r.qual) + other_chars;
+  string buffer(buf_size, '\0');
+  string::iterator b = begin(buffer);
+  *b++ = '@';
+  b = copy(cbegin(r.name), cend(r.name), b);
+  *b++ = '\n';
+  b = copy(cbegin(r.seq), cend(r.seq), b);
+  *b++ = '\n';
+  *b++ = '+';
+  *b++ = '\n';
+  b = copy(cbegin(r.qual), cend(r.qual), b);
+  *b++ = '\n';
+  out.write(buffer);
+  return out;
 }
 
 // Read 4 lines one time from fastq and fill in the FASTQRecord structure
-static std::istream&
-operator>>(std::istream& s, FASTQRecord &r) {
-  if (getline(s, r.name)) {
+static bgzf_file &
+operator>>(bgzf_file &s, FASTQRecord &r) {
+  constexpr auto n_error_codes = 5u;
 
-    if (r.name.empty() || r.name[0] != '@')
-      throw std::runtime_error("bad name line: " + r.name);
+  enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual };
 
-    r.name = r.name.substr(1, r.name.find_first_of(' '));
+  static const array<runtime_error, n_error_codes> error_msg = {
+    runtime_error(""), runtime_error("failed to parse fastq name line"),
+    runtime_error("failed to parse fastq sequence line"),
+    runtime_error("failed to parse fastq plus line"),
+    runtime_error("failed to parse fastq qual line")};
 
-    if (!getline(s, r.seq))
-      throw runtime_error("failed to read expected seq line");
+  err_code ec = err_code::none;
 
-    string tmp;
-    if (!getline(s, tmp))
-      throw runtime_error("failed to read expected + line");
+  if (!getline(s, r.name)) return s;
 
-    if (!getline(s, r.score))
-      throw runtime_error("failed to read expected score line");
-  }
-  return s;
-}
+  if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name;
 
-////////////////////////////////////////////////////////////////////////////////
+  const auto nm_end = r.name.find_first_of(" \t");
+  const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1;
+  r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name));
 
-static bool
-similar_letters_bisulfite_tc_and_ag(const char a, const char b) {
-  return (a == b) || (a == 'T' && b == 'C') || (a == 'G' && b == 'A');
-}
+  if (!getline(s, r.seq)) ec = err_code::bad_seq;
 
-// compare two reads to detect the overlapped region
-static size_t
-similarity_both_bisulfite_convsersions(const string &s1, const string &s2) {
+  string tmp;
+  if (!getline(s, tmp)) ec = err_code::bad_plus;
 
-  const size_t lim = min(s1.length(), s2.length());
+  if (!getline(s, r.qual)) ec = err_code::bad_qual;
 
-  size_t count = 0;
-  for (size_t i = 0; i < lim; ++i)
-    if (similar_letters_bisulfite_tc_and_ag(s1[i], s2[i]))
-      count++;
+  if (ec != err_code::none) throw error_msg[ec];
 
-  return count;
+  return s;
 }
 
+static inline bool
+similar_letters_bisulfite_tc_and_ag(const char a, const char b) {
+  return (a != 'N' && a == b) || (a == 'T' && b == 'C') ||
+         (a == 'G' && b == 'A');
+}
 
+// compare two reads to detect the overlapped region
 static double
-check_hairpins(const size_t name_suffix_len,
-               const size_t reads_to_check, const double cutoff,
-               const string &reads_file1, const string &reads_file2) {
-
-  // Input: paired-end reads with end1 and end2
-  std::ifstream in1(reads_file1);
-  if (!in1)
-    throw runtime_error("cannot open input file: " + reads_file1);
-
-  std::ifstream in2(reads_file2);
-  if (!in2)
-    throw runtime_error("cannot open input file: " + reads_file2);
-
-  size_t n_reads = 0;
-  size_t n_bad_reads = 0;
-
-  FASTQRecord end_one, end_two;
-  while (n_reads < reads_to_check && in1 >> end_one && in2 >> end_two) {
-    ++n_reads;
-    // two reads should be in paired-ends
-    if (!mates(name_suffix_len, end_one, end_two))
-      throw runtime_error("expected mates, got:\n" +
-                          end_one.tostring() + "\nand:\n" +
-                          end_two.tostring());
-
-    // See if inverted duplicates emerge
-    const size_t sim =
-      similarity_both_bisulfite_convsersions(end_one.seq, end_two.seq);
-
-    const double min_len = min(end_one.seq.length(), end_two.seq.length());
-    const double percent_match = sim/min_len;
+similarity_both_bisulfite_convsersions(const string &s1, const string &s2) {
+  const size_t lim = min(size(s1), size(s2));
 
-    if (percent_match > cutoff)
-      ++n_bad_reads;
+  uint32_t total_letters = 0;
+  uint32_t matching_letters = 0;
+  for (size_t i = 0; i < lim; ++i) {
+    matching_letters += (similar_letters_bisulfite_tc_and_ag(s1[i], s2[i]));
+    total_letters += (valid_base(s1[i]) && valid_base(s2[i]));
   }
 
-  return static_cast<double>(n_bad_reads)/n_reads;
+  return static_cast<double>(matching_letters) / total_letters;
 }
 
 struct hp_summary {
 
+  explicit hp_summary(const double cutoff) : cutoff{cutoff} {}
+
   // n_reads is the total number of read pairs in the input fastq
   // files.
   uint64_t n_reads{};
 
+  // n_good_reads is the total number of read pairs that together have
+  // a fraction of good bases above the minimum specified.
+  uint64_t n_good_reads{};
+
   // n_hairpin_reads is the number of read pairs identified as being
   // hairpins using the criteria in the "cutoff" variable.
   uint64_t n_hairpin_reads{};
@@ -183,64 +171,188 @@ struct hp_summary {
   // sum_percent_match_bad over the total hairpin reads.
   double mean_percent_match_hairpin{};
 
-  auto assign_values() -> void {
+  auto
+  assign_values() -> void {
     mean_percent_match_non_hairpin =
       sum_percent_match_good / (n_reads - n_hairpin_reads);
     mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads;
   }
 
-  auto tostring() const -> string {
-    std::ostringstream oss;
+  auto
+  tostring() const -> string {
+    ostringstream oss;
     oss << "total_reads_pairs: " << n_reads << '\n'
+        << "good_reads_pairs: " << n_good_reads << '\n'
         << "hairpin_read_pairs: " << n_hairpin_reads << '\n'
         << "hairpin_cutoff: " << cutoff << '\n'
         << "sum_percent_match_good: " << sum_percent_match_good << '\n'
-        << "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin << '\n'
+        << "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin
+        << '\n'
         << "sum_percent_match_bad: " << sum_percent_match_bad << '\n'
         << "mean_percent_match_hairpin: " << mean_percent_match_hairpin << '\n';
     return oss.str();
   }
 };
 
+static void
+write_histogram(const string &hist_outfile, vector<double> hist) {
+  ofstream hist_out(hist_outfile);
+  if (!hist_out) throw runtime_error("failed to open file: " + hist_outfile);
+  const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
+  transform(cbegin(hist), cend(hist), begin(hist),
+            [&total](const double t) { return t / total; });
+  const double increment = 1.0 / size(hist);
+  auto idx = 0;
+  hist_out << std::fixed;
+  hist_out.precision(3);
+  for (double offset = 0.0; offset < 1.0; offset += increment)
+    hist_out << offset << '\t' << hist[idx++] << '\n';
+}
+
+static void
+write_statistics(const string &filename, hp_summary hps) {
+  hps.assign_values();
+  ofstream out(filename);
+  if (!out) throw runtime_error("failed to open file: " + filename);
+  out << hps.tostring();
+}
+
+static inline double
+fraction_good_bases(const FASTQRecord &a, const FASTQRecord &b) {
+  const double a_bases = count_if(cbegin(a.seq), cend(a.seq), &valid_base);
+  const double b_bases = count_if(cbegin(b.seq), cend(b.seq), &valid_base);
+  return (a_bases + b_bases) / (size(a.seq) + size(b.seq));
+}
+
+struct clean_hairpin {
+  double cutoff{0.95};
+  uint64_t n_reads_to_check{std::numeric_limits<size_t>::max()};
+  double min_good_base_percent{0.5};
+  uint32_t min_read_length{32};
+  uint32_t n_hist_bins{20};
+  bool invert_output{false};
+
+  hp_summary
+  analyze_reads(const string &outfile1, const string &outfile2, bgzf_file &in1,
+                bgzf_file &in2, vector<double> &hist) const;
+  hp_summary
+  analyze_reads(bgzf_file &in1, bgzf_file &in2, vector<double> &hist) const;
+};
+
+hp_summary
+clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2,
+                             bgzf_file &in1, bgzf_file &in2,
+                             vector<double> &hist) const {
+
+  // output files for read1 and read2 with hairpins removed
+  bgzf_file out1(outfile1, "w");
+  if (!out1) throw runtime_error("cannot open output file: " + outfile1);
+  bgzf_file out2(outfile2, "w");
+  if (!out2) throw runtime_error("cannot open output file: " + outfile2);
+
+  hp_summary hps{cutoff};
+  FASTQRecord r1, r2;
+  while (hps.n_reads < n_reads_to_check && in1 >> r1 && in2 >> r2) {
+    ++hps.n_reads;
+
+    if (fraction_good_bases(r1, r2) < min_good_base_percent ||
+        size(r1.seq) < min_read_length || size(r2.seq) < min_read_length)
+      continue;
+
+    ++hps.n_good_reads;
+
+    // See if inverted duplicates emerge
+    const double percent_match =
+      similarity_both_bisulfite_convsersions(r1.seq, r2.seq);
+
+    // ADS: need a bitter way to get this bin identifier
+    ++hist[floor(percent_match * n_hist_bins)];
+
+    if (percent_match > cutoff) {
+      hps.sum_percent_match_bad += percent_match;
+      hps.n_hairpin_reads++;
+      if (invert_output) {
+        out1 << r1;
+        out2 << r2;
+      }
+    }
+    else {
+      hps.sum_percent_match_good += percent_match;
+      if (!invert_output) {
+        out1 << r1;
+        out2 << r2;
+      }
+    }
+  }
+  return hps;
+}
+
+hp_summary
+clean_hairpin::analyze_reads(bgzf_file &in1, bgzf_file &in2,
+                             vector<double> &hist) const {
+  hp_summary hps{cutoff};
+  FASTQRecord r1, r2;
+  while (hps.n_reads < n_reads_to_check && in1 >> r1 && in2 >> r2) {
+    ++hps.n_reads;
+
+    if (fraction_good_bases(r1, r2) < min_good_base_percent ||
+        size(r1.seq) < min_read_length || size(r2.seq) < min_read_length)
+      continue;
+
+    ++hps.n_good_reads;
+
+    const double percent_match =
+      similarity_both_bisulfite_convsersions(r1.seq, r2.seq);
+    ++hist[floor(percent_match * n_hist_bins)];
+
+    if (percent_match > cutoff) {
+      hps.sum_percent_match_bad += percent_match;
+      hps.n_hairpin_reads++;
+    }
+    else
+      hps.sum_percent_match_good += percent_match;
+  }
+  return hps;
+}
+
 int
 main_clean_hairpins(int argc, const char **argv) {
+
+  static const string description = "fix and stat invdup/hairping reads";
+
   try {
 
-    string outfile;
+    string outfile1;
+    string outfile2;
     string stat_outfile;
     string hist_outfile;
 
-    double cutoff = 0.95;
-    size_t name_suffix_len = 0;
+    clean_hairpin ch{};
 
-    size_t reads_to_check = 1000000;
-    double max_hairpin_rate = 0.1;
-
-    bool VERBOSE = false;
-    bool check_first = false;
+    bool verbose = false;
 
     /****************** COMMAND LINE OPTIONS ********************/
-    OptionParser opt_parse(strip_path(argv[0]),
-                           "fix and stat invdup/hairping reads",
-                           "<end1-fastq> <end2-fastq>");
-    opt_parse.add_opt("output", 'o', "output filename", true, outfile);
-    opt_parse.add_opt("stat", 's', "stats output filename", true, stat_outfile);
-    opt_parse.add_opt("hairpin", 'H', "max hairpin rate", false,
-                      max_hairpin_rate);
-    opt_parse.add_opt("check", '\0', "check for hairpin contamination", false,
-                      check_first);
-    opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false,
-                      reads_to_check);
-    opt_parse.add_opt("cutoff", 'c',
-                      "cutoff for calling an invdup (default: 0.95)", false,
-                      cutoff);
-    opt_parse.add_opt("ignore", 'i',
-                      "length of read name suffix "
-                      "to ignore when matching",
-                      false, name_suffix_len);
-    opt_parse.add_opt("hist", '\0', "write a histogram of hairpin matches here",
-                      true, hist_outfile);
-    opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
+    OptionParser opt_parse(strip_path(argv[0]), description,
+                           "<end1-fastq> <end2-fastq>", true);
+    opt_parse.set_show_defaults();
+    opt_parse.add_opt("out1", 'o', "output file for read 1", false, outfile1);
+    opt_parse.add_opt("out2", 'p', "output file for read 2", false, outfile2);
+    opt_parse.add_opt("stat", 's', "stats output file", false, stat_outfile);
+    opt_parse.add_opt("nreads", 'n', "number of reads to process", false,
+                      ch.n_reads_to_check);
+    opt_parse.add_opt("cutoff", 'c', "min percent id to call a hairpin", false,
+                      ch.cutoff);
+    opt_parse.add_opt("good-bases", 'g', "min percent non-N to consider", false,
+                      ch.min_good_base_percent);
+    opt_parse.add_opt("length", 'l', "min read length to consider", false,
+                      ch.min_read_length);
+    opt_parse.add_opt("invert", 'I', "output hairpins instead good pairs",
+                      false, ch.invert_output);
+    opt_parse.add_opt("hist", 'H', "write a histogram of hairpin matches here",
+                      false, hist_outfile);
+    opt_parse.add_opt("bins", 'b', "number of histograms bins", false,
+                      ch.n_hist_bins);
+    opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose);
     vector<string> leftover_args;
     opt_parse.parse(argc, argv, leftover_args);
     if (argc == 1 || opt_parse.help_requested()) {
@@ -260,104 +372,33 @@ main_clean_hairpins(int argc, const char **argv) {
     const string reads_file2(leftover_args.back());
     /****************** END COMMAND LINE OPTIONS *****************/
 
-    double hairpin_fraction = 0.0;
-    if (check_first) {
-      if (VERBOSE)
-        cerr << "testing for hairpin contamination" << endl;
-
-      hairpin_fraction =
-        check_hairpins(name_suffix_len,
-                       reads_to_check, cutoff, reads_file1, reads_file2);
-
-      if (VERBOSE) {
-        if (hairpin_fraction > max_hairpin_rate)
-          cerr << "hairpin problem detected" << endl;
-        else
-          cerr << "no hairpin problem detected" << endl;
-      }
+    if (outfile1.empty() != outfile2.empty()) {
+      // ADS: add message about number of reads to check and output files
+      cerr << "error: specify both or neither of out1/o and out2/p" << endl;
+      return EXIT_FAILURE;
     }
 
-    hp_summary hps;
-    hps.cutoff = cutoff;
-    vector<double> hist(20, 0.0);
-
-    if (!check_first || hairpin_fraction > max_hairpin_rate) {
-
-      // Input: paired-end reads with end1 and end2
-      std::ifstream in1(reads_file1);
-      if (!in1)
-        throw runtime_error("cannot open input file: " + reads_file1);
-
-      std::ifstream in2(reads_file2);
-      if (!in2)
-        throw runtime_error("cannot open input file: " + reads_file2);
-
-      // output read2 with hairpins removed
-      std::ofstream out(outfile);
-      if (!out)
-        throw runtime_error("cannot open output file: " + outfile);
+    const bool write_reads = !outfile1.empty();
 
-      FASTQRecord end_one, end_two;
-      while (in1 >> end_one && in2 >> end_two) {
-        hps.n_reads++;
+    vector<double> hist(ch.n_hist_bins, 0.0);
 
-        // two reads should be in paired-ends
-        if (!mates(name_suffix_len, end_one, end_two))
-          throw runtime_error("expected mates, got:\n" +
-                              end_one.tostring() + "\nand:\n" +
-                              end_two.tostring());
+    // Input: paired-end reads with end1 and end2
+    bgzf_file in1(reads_file1, "r");
+    if (!in1) throw runtime_error("cannot open input file: " + reads_file1);
+    bgzf_file in2(reads_file2, "r");
+    if (!in2) throw runtime_error("cannot open input file: " + reads_file2);
 
-        // See if inverted duplicates emerge
-        const size_t sim =
-          similarity_both_bisulfite_convsersions(end_one.seq, end_two.seq);
+    const hp_summary hps =
+      write_reads ? ch.analyze_reads(outfile1, outfile2, in1, in2, hist)
+                  : ch.analyze_reads(in1, in2, hist);
 
-        const double min_len = min(end_one.seq.length(), end_two.seq.length());
-        const double percent_match = sim/min_len;
+    if (!stat_outfile.empty()) write_statistics(stat_outfile, hps);
 
-        // ADS: need a bitter way to get this bin identifier
-        const int hist_bin = hist.size()*((sim - 0.001)/(min_len + 0.001));
-        hist[hist_bin]++;
-
-        if (percent_match > cutoff) {
-
-          hps.sum_percent_match_bad += percent_match;
-          hps.n_hairpin_reads++;
-
-          end_two.seq.clear();
-          end_two.score.clear();
-        }
-        else
-          hps.sum_percent_match_good += percent_match;
-
-        out << end_two << '\n';
-      }
-
-      std::ofstream stat_out(stat_outfile);
-      if (!stat_out)
-        throw runtime_error("failed to open file: " + stat_outfile);
-      hps.assign_values();
-      stat_out << hps.tostring();
-
-      if (!hist_outfile.empty()) {
-        std::ofstream hist_out(hist_outfile);
-        if (!hist_out)
-          throw runtime_error("failed to open file: " + hist_outfile);
-        const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
-        transform(cbegin(hist), cend(hist), begin(hist),
-                  [&total](const double t) { return t / total; });
-        hist_out.precision(3);
-        for (auto i = 0u; i < std::size(hist); ++i)
-          hist_out << i << '\t' << hist[i] << '\n';
-      }
-    }
+    if (!hist_outfile.empty()) write_histogram(hist_outfile, hist);
   }
-  catch (const runtime_error &e) {
+  catch (const std::exception &e) {
     cerr << e.what() << endl;
     return EXIT_FAILURE;
   }
-  catch (std::bad_alloc &ba) {
-    cerr << "ERROR: could not allocate memory" << endl;
-    return EXIT_FAILURE;
-  }
   return EXIT_SUCCESS;
 }