From f5968b3157665341f3239063b0cac1f6da0bc219 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Thu, 15 Aug 2024 21:29:52 -0700 Subject: [PATCH] src/utils/clean-hairpins.cpp: major overhaul to code organization and many behaviors --- src/utils/clean-hairpins.cpp | 473 +++++++++++++++++++---------------- 1 file changed, 257 insertions(+), 216 deletions(-) diff --git a/src/utils/clean-hairpins.cpp b/src/utils/clean-hairpins.cpp index a937bccc..b696c15f 100644 --- a/src/utils/clean-hairpins.cpp +++ b/src/utils/clean-hairpins.cpp @@ -16,148 +16,136 @@ * General Public License for more details. */ -#include -#include -#include +#include + +#include +#include #include +#include +#include #include -#include +#include #include +#include +#include #include "OptionParser.hpp" -#include "smithlab_utils.hpp" #include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -using std::string; -using std::vector; -using std::cout; +using std::array; using std::cerr; +using std::cout; using std::endl; +using std::floor; +using std::ifstream; +using std::istream; using std::min; +using std::ofstream; +using std::ostream; +using std::ostringstream; using std::runtime_error; +using std::string; +using std::vector; + +using bamxx::bgzf_file; // store each read from one end struct FASTQRecord { string name; string seq; - string score; - - string tostring() const { - std::ostringstream s; - s << '@' - << name << '\n' - << seq << '\n' - << '+' << '\n' - << score; - return s.str(); - } + string qual; }; -// see if two reads from two ends match to each other (they should -// have the same name) -static bool -mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends - const FASTQRecord &a, const FASTQRecord &b) { - assert(to_ignore_at_end < a.name.length()); - return equal(begin(a.name), end(a.name) - to_ignore_at_end, begin(b.name)); -} - -static std::ostream& -operator<<(std::ostream& s, const FASTQRecord &r) { - return s << r.tostring(); +bgzf_file & +operator<<(bgzf_file &out, const FASTQRecord &r) { + // below, the other chars are @, + and four newlines + static const uint32_t other_chars = 6; + const size_t buf_size = + size(r.name) + size(r.seq) + size(r.qual) + other_chars; + string buffer(buf_size, '\0'); + string::iterator b = begin(buffer); + *b++ = '@'; + b = copy(cbegin(r.name), cend(r.name), b); + *b++ = '\n'; + b = copy(cbegin(r.seq), cend(r.seq), b); + *b++ = '\n'; + *b++ = '+'; + *b++ = '\n'; + b = copy(cbegin(r.qual), cend(r.qual), b); + *b++ = '\n'; + out.write(buffer); + return out; } // Read 4 lines one time from fastq and fill in the FASTQRecord structure -static std::istream& -operator>>(std::istream& s, FASTQRecord &r) { - if (getline(s, r.name)) { +static bgzf_file & +operator>>(bgzf_file &s, FASTQRecord &r) { + constexpr auto n_error_codes = 5u; - if (r.name.empty() || r.name[0] != '@') - throw std::runtime_error("bad name line: " + r.name); + enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual }; - r.name = r.name.substr(1, r.name.find_first_of(' ')); + static const array error_msg = { + runtime_error(""), runtime_error("failed to parse fastq name line"), + runtime_error("failed to parse fastq sequence line"), + runtime_error("failed to parse fastq plus line"), + runtime_error("failed to parse fastq qual line")}; - if (!getline(s, r.seq)) - throw runtime_error("failed to read expected seq line"); + err_code ec = err_code::none; - string tmp; - if (!getline(s, tmp)) - throw runtime_error("failed to read expected + line"); + if (!getline(s, r.name)) return s; - if (!getline(s, r.score)) - throw runtime_error("failed to read expected score line"); - } - return s; -} + if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name; -//////////////////////////////////////////////////////////////////////////////// + const auto nm_end = r.name.find_first_of(" \t"); + const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1; + r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name)); -static bool -similar_letters_bisulfite_tc_and_ag(const char a, const char b) { - return (a == b) || (a == 'T' && b == 'C') || (a == 'G' && b == 'A'); -} + if (!getline(s, r.seq)) ec = err_code::bad_seq; -// compare two reads to detect the overlapped region -static size_t -similarity_both_bisulfite_convsersions(const string &s1, const string &s2) { + string tmp; + if (!getline(s, tmp)) ec = err_code::bad_plus; - const size_t lim = min(s1.length(), s2.length()); + if (!getline(s, r.qual)) ec = err_code::bad_qual; - size_t count = 0; - for (size_t i = 0; i < lim; ++i) - if (similar_letters_bisulfite_tc_and_ag(s1[i], s2[i])) - count++; + if (ec != err_code::none) throw error_msg[ec]; - return count; + return s; } +static inline bool +similar_letters_bisulfite_tc_and_ag(const char a, const char b) { + return (a != 'N' && a == b) || (a == 'T' && b == 'C') || + (a == 'G' && b == 'A'); +} +// compare two reads to detect the overlapped region static double -check_hairpins(const size_t name_suffix_len, - const size_t reads_to_check, const double cutoff, - const string &reads_file1, const string &reads_file2) { - - // Input: paired-end reads with end1 and end2 - std::ifstream in1(reads_file1); - if (!in1) - throw runtime_error("cannot open input file: " + reads_file1); - - std::ifstream in2(reads_file2); - if (!in2) - throw runtime_error("cannot open input file: " + reads_file2); - - size_t n_reads = 0; - size_t n_bad_reads = 0; - - FASTQRecord end_one, end_two; - while (n_reads < reads_to_check && in1 >> end_one && in2 >> end_two) { - ++n_reads; - // two reads should be in paired-ends - if (!mates(name_suffix_len, end_one, end_two)) - throw runtime_error("expected mates, got:\n" + - end_one.tostring() + "\nand:\n" + - end_two.tostring()); - - // See if inverted duplicates emerge - const size_t sim = - similarity_both_bisulfite_convsersions(end_one.seq, end_two.seq); - - const double min_len = min(end_one.seq.length(), end_two.seq.length()); - const double percent_match = sim/min_len; +similarity_both_bisulfite_convsersions(const string &s1, const string &s2) { + const size_t lim = min(size(s1), size(s2)); - if (percent_match > cutoff) - ++n_bad_reads; + uint32_t total_letters = 0; + uint32_t matching_letters = 0; + for (size_t i = 0; i < lim; ++i) { + matching_letters += (similar_letters_bisulfite_tc_and_ag(s1[i], s2[i])); + total_letters += (valid_base(s1[i]) && valid_base(s2[i])); } - return static_cast(n_bad_reads)/n_reads; + return static_cast(matching_letters) / total_letters; } struct hp_summary { + explicit hp_summary(const double cutoff) : cutoff{cutoff} {} + // n_reads is the total number of read pairs in the input fastq // files. uint64_t n_reads{}; + // n_good_reads is the total number of read pairs that together have + // a fraction of good bases above the minimum specified. + uint64_t n_good_reads{}; + // n_hairpin_reads is the number of read pairs identified as being // hairpins using the criteria in the "cutoff" variable. uint64_t n_hairpin_reads{}; @@ -183,64 +171,188 @@ struct hp_summary { // sum_percent_match_bad over the total hairpin reads. double mean_percent_match_hairpin{}; - auto assign_values() -> void { + auto + assign_values() -> void { mean_percent_match_non_hairpin = sum_percent_match_good / (n_reads - n_hairpin_reads); mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads; } - auto tostring() const -> string { - std::ostringstream oss; + auto + tostring() const -> string { + ostringstream oss; oss << "total_reads_pairs: " << n_reads << '\n' + << "good_reads_pairs: " << n_good_reads << '\n' << "hairpin_read_pairs: " << n_hairpin_reads << '\n' << "hairpin_cutoff: " << cutoff << '\n' << "sum_percent_match_good: " << sum_percent_match_good << '\n' - << "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin << '\n' + << "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin + << '\n' << "sum_percent_match_bad: " << sum_percent_match_bad << '\n' << "mean_percent_match_hairpin: " << mean_percent_match_hairpin << '\n'; return oss.str(); } }; +static void +write_histogram(const string &hist_outfile, vector hist) { + ofstream hist_out(hist_outfile); + if (!hist_out) throw runtime_error("failed to open file: " + hist_outfile); + const auto total = accumulate(cbegin(hist), cend(hist), 0.0); + transform(cbegin(hist), cend(hist), begin(hist), + [&total](const double t) { return t / total; }); + const double increment = 1.0 / size(hist); + auto idx = 0; + hist_out << std::fixed; + hist_out.precision(3); + for (double offset = 0.0; offset < 1.0; offset += increment) + hist_out << offset << '\t' << hist[idx++] << '\n'; +} + +static void +write_statistics(const string &filename, hp_summary hps) { + hps.assign_values(); + ofstream out(filename); + if (!out) throw runtime_error("failed to open file: " + filename); + out << hps.tostring(); +} + +static inline double +fraction_good_bases(const FASTQRecord &a, const FASTQRecord &b) { + const double a_bases = count_if(cbegin(a.seq), cend(a.seq), &valid_base); + const double b_bases = count_if(cbegin(b.seq), cend(b.seq), &valid_base); + return (a_bases + b_bases) / (size(a.seq) + size(b.seq)); +} + +struct clean_hairpin { + double cutoff{0.95}; + uint64_t n_reads_to_check{std::numeric_limits::max()}; + double min_good_base_percent{0.5}; + uint32_t min_read_length{32}; + uint32_t n_hist_bins{20}; + bool invert_output{false}; + + hp_summary + analyze_reads(const string &outfile1, const string &outfile2, bgzf_file &in1, + bgzf_file &in2, vector &hist) const; + hp_summary + analyze_reads(bgzf_file &in1, bgzf_file &in2, vector &hist) const; +}; + +hp_summary +clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2, + bgzf_file &in1, bgzf_file &in2, + vector &hist) const { + + // output files for read1 and read2 with hairpins removed + bgzf_file out1(outfile1, "w"); + if (!out1) throw runtime_error("cannot open output file: " + outfile1); + bgzf_file out2(outfile2, "w"); + if (!out2) throw runtime_error("cannot open output file: " + outfile2); + + hp_summary hps{cutoff}; + FASTQRecord r1, r2; + while (hps.n_reads < n_reads_to_check && in1 >> r1 && in2 >> r2) { + ++hps.n_reads; + + if (fraction_good_bases(r1, r2) < min_good_base_percent || + size(r1.seq) < min_read_length || size(r2.seq) < min_read_length) + continue; + + ++hps.n_good_reads; + + // See if inverted duplicates emerge + const double percent_match = + similarity_both_bisulfite_convsersions(r1.seq, r2.seq); + + // ADS: need a bitter way to get this bin identifier + ++hist[floor(percent_match * n_hist_bins)]; + + if (percent_match > cutoff) { + hps.sum_percent_match_bad += percent_match; + hps.n_hairpin_reads++; + if (invert_output) { + out1 << r1; + out2 << r2; + } + } + else { + hps.sum_percent_match_good += percent_match; + if (!invert_output) { + out1 << r1; + out2 << r2; + } + } + } + return hps; +} + +hp_summary +clean_hairpin::analyze_reads(bgzf_file &in1, bgzf_file &in2, + vector &hist) const { + hp_summary hps{cutoff}; + FASTQRecord r1, r2; + while (hps.n_reads < n_reads_to_check && in1 >> r1 && in2 >> r2) { + ++hps.n_reads; + + if (fraction_good_bases(r1, r2) < min_good_base_percent || + size(r1.seq) < min_read_length || size(r2.seq) < min_read_length) + continue; + + ++hps.n_good_reads; + + const double percent_match = + similarity_both_bisulfite_convsersions(r1.seq, r2.seq); + ++hist[floor(percent_match * n_hist_bins)]; + + if (percent_match > cutoff) { + hps.sum_percent_match_bad += percent_match; + hps.n_hairpin_reads++; + } + else + hps.sum_percent_match_good += percent_match; + } + return hps; +} + int main_clean_hairpins(int argc, const char **argv) { + + static const string description = "fix and stat invdup/hairping reads"; + try { - string outfile; + string outfile1; + string outfile2; string stat_outfile; string hist_outfile; - double cutoff = 0.95; - size_t name_suffix_len = 0; + clean_hairpin ch{}; - size_t reads_to_check = 1000000; - double max_hairpin_rate = 0.1; - - bool VERBOSE = false; - bool check_first = false; + bool verbose = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), - "fix and stat invdup/hairping reads", - " "); - opt_parse.add_opt("output", 'o', "output filename", true, outfile); - opt_parse.add_opt("stat", 's', "stats output filename", true, stat_outfile); - opt_parse.add_opt("hairpin", 'H', "max hairpin rate", false, - max_hairpin_rate); - opt_parse.add_opt("check", '\0', "check for hairpin contamination", false, - check_first); - opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false, - reads_to_check); - opt_parse.add_opt("cutoff", 'c', - "cutoff for calling an invdup (default: 0.95)", false, - cutoff); - opt_parse.add_opt("ignore", 'i', - "length of read name suffix " - "to ignore when matching", - false, name_suffix_len); - opt_parse.add_opt("hist", '\0', "write a histogram of hairpin matches here", - true, hist_outfile); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + OptionParser opt_parse(strip_path(argv[0]), description, + " ", true); + opt_parse.set_show_defaults(); + opt_parse.add_opt("out1", 'o', "output file for read 1", false, outfile1); + opt_parse.add_opt("out2", 'p', "output file for read 2", false, outfile2); + opt_parse.add_opt("stat", 's', "stats output file", false, stat_outfile); + opt_parse.add_opt("nreads", 'n', "number of reads to process", false, + ch.n_reads_to_check); + opt_parse.add_opt("cutoff", 'c', "min percent id to call a hairpin", false, + ch.cutoff); + opt_parse.add_opt("good-bases", 'g', "min percent non-N to consider", false, + ch.min_good_base_percent); + opt_parse.add_opt("length", 'l', "min read length to consider", false, + ch.min_read_length); + opt_parse.add_opt("invert", 'I', "output hairpins instead good pairs", + false, ch.invert_output); + opt_parse.add_opt("hist", 'H', "write a histogram of hairpin matches here", + false, hist_outfile); + opt_parse.add_opt("bins", 'b', "number of histograms bins", false, + ch.n_hist_bins); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { @@ -260,104 +372,33 @@ main_clean_hairpins(int argc, const char **argv) { const string reads_file2(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ - double hairpin_fraction = 0.0; - if (check_first) { - if (VERBOSE) - cerr << "testing for hairpin contamination" << endl; - - hairpin_fraction = - check_hairpins(name_suffix_len, - reads_to_check, cutoff, reads_file1, reads_file2); - - if (VERBOSE) { - if (hairpin_fraction > max_hairpin_rate) - cerr << "hairpin problem detected" << endl; - else - cerr << "no hairpin problem detected" << endl; - } + if (outfile1.empty() != outfile2.empty()) { + // ADS: add message about number of reads to check and output files + cerr << "error: specify both or neither of out1/o and out2/p" << endl; + return EXIT_FAILURE; } - hp_summary hps; - hps.cutoff = cutoff; - vector hist(20, 0.0); - - if (!check_first || hairpin_fraction > max_hairpin_rate) { - - // Input: paired-end reads with end1 and end2 - std::ifstream in1(reads_file1); - if (!in1) - throw runtime_error("cannot open input file: " + reads_file1); - - std::ifstream in2(reads_file2); - if (!in2) - throw runtime_error("cannot open input file: " + reads_file2); - - // output read2 with hairpins removed - std::ofstream out(outfile); - if (!out) - throw runtime_error("cannot open output file: " + outfile); + const bool write_reads = !outfile1.empty(); - FASTQRecord end_one, end_two; - while (in1 >> end_one && in2 >> end_two) { - hps.n_reads++; + vector hist(ch.n_hist_bins, 0.0); - // two reads should be in paired-ends - if (!mates(name_suffix_len, end_one, end_two)) - throw runtime_error("expected mates, got:\n" + - end_one.tostring() + "\nand:\n" + - end_two.tostring()); + // Input: paired-end reads with end1 and end2 + bgzf_file in1(reads_file1, "r"); + if (!in1) throw runtime_error("cannot open input file: " + reads_file1); + bgzf_file in2(reads_file2, "r"); + if (!in2) throw runtime_error("cannot open input file: " + reads_file2); - // See if inverted duplicates emerge - const size_t sim = - similarity_both_bisulfite_convsersions(end_one.seq, end_two.seq); + const hp_summary hps = + write_reads ? ch.analyze_reads(outfile1, outfile2, in1, in2, hist) + : ch.analyze_reads(in1, in2, hist); - const double min_len = min(end_one.seq.length(), end_two.seq.length()); - const double percent_match = sim/min_len; + if (!stat_outfile.empty()) write_statistics(stat_outfile, hps); - // ADS: need a bitter way to get this bin identifier - const int hist_bin = hist.size()*((sim - 0.001)/(min_len + 0.001)); - hist[hist_bin]++; - - if (percent_match > cutoff) { - - hps.sum_percent_match_bad += percent_match; - hps.n_hairpin_reads++; - - end_two.seq.clear(); - end_two.score.clear(); - } - else - hps.sum_percent_match_good += percent_match; - - out << end_two << '\n'; - } - - std::ofstream stat_out(stat_outfile); - if (!stat_out) - throw runtime_error("failed to open file: " + stat_outfile); - hps.assign_values(); - stat_out << hps.tostring(); - - if (!hist_outfile.empty()) { - std::ofstream hist_out(hist_outfile); - if (!hist_out) - throw runtime_error("failed to open file: " + hist_outfile); - const auto total = accumulate(cbegin(hist), cend(hist), 0.0); - transform(cbegin(hist), cend(hist), begin(hist), - [&total](const double t) { return t / total; }); - hist_out.precision(3); - for (auto i = 0u; i < std::size(hist); ++i) - hist_out << i << '\t' << hist[i] << '\n'; - } - } + if (!hist_outfile.empty()) write_histogram(hist_outfile, hist); } - catch (const runtime_error &e) { + catch (const std::exception &e) { cerr << e.what() << endl; return EXIT_FAILURE; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; - return EXIT_FAILURE; - } return EXIT_SUCCESS; }