From 5b9b71f9ae74df286bffb42c2d3781199856c1a8 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 10:10:32 -0700 Subject: [PATCH 1/3] Putting guards around parts of source that should not be formatted --- dna_four_bit.cpp | 2 ++ smithlab_utils.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dna_four_bit.cpp b/dna_four_bit.cpp index 199770e..2b75bc5 100644 --- a/dna_four_bit.cpp +++ b/dna_four_bit.cpp @@ -16,6 +16,7 @@ #include "dna_four_bit.hpp" +// clang-format off char dna_four_bit_decoding[] = { 'Z', // = 0000 = 0 = {} = Zero bases 'A', // = 0001 = 1 = {A} = Adenine @@ -76,3 +77,4 @@ uint8_t dna_four_bit_encoding[] = { }; // . A B C D . . G H . . K . M N . // . . R S T . V W . Y Z +// clang-format on diff --git a/smithlab_utils.cpp b/smithlab_utils.cpp index 69f9c91..edd1e2b 100644 --- a/smithlab_utils.cpp +++ b/smithlab_utils.cpp @@ -82,14 +82,16 @@ smithlab::get_fdr_cutoff(const size_t n_tests, vector &pvals, char complement(int i) { static const int b2c_size = 20; + // clang-format off static const char b2c[] = { //A, b, C, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, T - 'T','N','G','N','N','N','C','N','N','N','N','N','N','N','N','N','N','N','N','A' + 'T','N','G','N','N','N','C','N','N','N','N','N','N','N','N','N','N','N','N','A' }; static const char b2cl[] = { //A, b, C, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, T - 't','n','g','n','n','n','c','n','n','n','n','n','n','n','n','n','n','n','n','a' + 't','n','g','n','n','n','c','n','n','n','n','n','n','n','n','n','n','n','n','a' }; + // clang-format on if (i - 'A' >= 0 && i - 'A' < b2c_size) return b2c[i - 'A']; else if (i - 'a' >= 0 && i - 'a' < b2c_size) @@ -236,6 +238,7 @@ ProgressBar::report(std::ostream &out, const size_t i) { out << '\n'; } +// clang-format off char to_valid_five_letter[] = { /* 0*/ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', /* 15*/ /* 16*/ 'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N', /* 31*/ @@ -246,3 +249,4 @@ char to_valid_five_letter[] = { /* 96*/ 'N','A','N','C','N','N','N','G','N','N','N','N','N','N','N','N', /*111*/ /*112*/ 'N','N','N','N','T','N','N','N','N','N','N','N','N','N','N','N' /*127*/ }; +// clang-format on From 505d18cbb9dece9daf9705ff6009271b07a3552b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 10:12:03 -0700 Subject: [PATCH 2/3] Running clang-format on all c++ source files --- GenomicRegion.cpp | 246 ++++++++++------------- GenomicRegion.hpp | 291 +++++++++++++-------------- MappedRead.cpp | 14 +- MappedRead.hpp | 6 +- OptionParser.cpp | 288 +++++++++++++-------------- OptionParser.hpp | 76 ++++---- QualityScore.cpp | 19 +- QualityScore.hpp | 136 +++++-------- bisulfite_utils.cpp | 7 +- bisulfite_utils.hpp | 32 ++- chromosome_utils.cpp | 129 ++++++------ chromosome_utils.hpp | 73 +++---- cigar_utils.cpp | 14 +- cigar_utils.hpp | 138 +++++-------- dna_four_bit.hpp | 54 +++-- htslib_wrapper.cpp | 20 +- htslib_wrapper.hpp | 6 +- sam_record.cpp | 69 +++---- sam_record.hpp | 142 ++++++-------- smithlab_os.cpp | 210 +++++++++----------- smithlab_os.hpp | 131 +++++-------- smithlab_utils.cpp | 88 ++++----- smithlab_utils.hpp | 456 +++++++++++++++++++++++-------------------- zlib_wrapper.cpp | 15 +- zlib_wrapper.hpp | 41 ++-- 25 files changed, 1216 insertions(+), 1485 deletions(-) diff --git a/GenomicRegion.cpp b/GenomicRegion.cpp index a839ebb..8922142 100644 --- a/GenomicRegion.cpp +++ b/GenomicRegion.cpp @@ -28,17 +28,16 @@ #include #include -using std::string; -using std::vector; using std::ostringstream; -using std::unordered_map; using std::runtime_error; +using std::string; +using std::unordered_map; +using std::vector; unordered_map SimpleGenomicRegion::fw_table_in; unordered_map SimpleGenomicRegion::fw_table_out; -chrom_id_type -SimpleGenomicRegion::assign_chrom(const std::string &c) { +chrom_id_type SimpleGenomicRegion::assign_chrom(const std::string &c) { auto chr_id(fw_table_in.find(c)); if (chr_id == fw_table_in.end()) { const chrom_id_type r = fw_table_in.size(); @@ -50,104 +49,90 @@ SimpleGenomicRegion::assign_chrom(const std::string &c) { return fw_table_in[c]; } - -string -SimpleGenomicRegion::retrieve_chrom(chrom_id_type i) { +string SimpleGenomicRegion::retrieve_chrom(chrom_id_type i) { auto chr_name(fw_table_out.find(i)); // assert(chr_name != fw_table_out.end()); return chr_name->second; } - -SimpleGenomicRegion::SimpleGenomicRegion(const GenomicRegion &r) : - chrom(assign_chrom(r.get_chrom())), start(r.get_start()), end(r.get_end()) {} +SimpleGenomicRegion::SimpleGenomicRegion(const GenomicRegion &r) + : chrom(assign_chrom(r.get_chrom())), start(r.get_start()), + end(r.get_end()) {} SimpleGenomicRegion::SimpleGenomicRegion(const char *s, const size_t len) { size_t i = 0; // the chrom - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; size_t j = i; - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; chrom = assign_chrom(string(s + j, i - j)); // start of the region (a positive integer) - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; start = atoi(s + j); // end of the region (a positive integer) - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; end = atoi(s + j); } -string -SimpleGenomicRegion::tostring() const { +string SimpleGenomicRegion::tostring() const { std::ostringstream s; s << retrieve_chrom(chrom) << "\t" << start << "\t" << end; return s.str(); } - -bool -SimpleGenomicRegion::contains(const SimpleGenomicRegion& other) const { +bool SimpleGenomicRegion::contains(const SimpleGenomicRegion &other) const { return chrom == other.chrom && start <= other.start && other.end <= end; } - -bool -SimpleGenomicRegion::overlaps(const SimpleGenomicRegion& other) const { +bool SimpleGenomicRegion::overlaps(const SimpleGenomicRegion &other) const { return chrom == other.chrom && - ((start < other.end && other.end <= end) || - (start <= other.start && other.start < end) || - other.contains(*this)); + ((start < other.end && other.end <= end) || + (start <= other.start && other.start < end) || other.contains(*this)); } - -size_t -SimpleGenomicRegion::distance(const SimpleGenomicRegion& other) const { +size_t SimpleGenomicRegion::distance(const SimpleGenomicRegion &other) const { if (chrom != other.chrom) return std::numeric_limits::max(); else if (overlaps(other) || other.overlaps(*this)) return 0; - else return (end < other.start) ? - other.start - end + 1 : start - other.end + 1; + else + return (end < other.start) ? other.start - end + 1 : start - other.end + 1; } - -bool -SimpleGenomicRegion::operator<(const SimpleGenomicRegion& rhs) const { +bool SimpleGenomicRegion::operator<(const SimpleGenomicRegion &rhs) const { return (get_chrom() < rhs.get_chrom() || (chrom == rhs.chrom && - (start < rhs.start || - (start == rhs.start && (end < rhs.end))))); + (start < rhs.start || (start == rhs.start && (end < rhs.end))))); } -bool -SimpleGenomicRegion::less1(const SimpleGenomicRegion& rhs) const { +bool SimpleGenomicRegion::less1(const SimpleGenomicRegion &rhs) const { return (get_chrom() < rhs.get_chrom() || (chrom == rhs.chrom && - (end < rhs.end || - (end == rhs.end && start < rhs.start)))); + (end < rhs.end || (end == rhs.end && start < rhs.start)))); } -bool -SimpleGenomicRegion::operator<=(const SimpleGenomicRegion& rhs) const { +bool SimpleGenomicRegion::operator<=(const SimpleGenomicRegion &rhs) const { return !(rhs < *this); } - -bool -SimpleGenomicRegion::operator==(const SimpleGenomicRegion& rhs) const { +bool SimpleGenomicRegion::operator==(const SimpleGenomicRegion &rhs) const { return (chrom == rhs.chrom && start == rhs.start && end == rhs.end); } - -bool -SimpleGenomicRegion::operator!=(const SimpleGenomicRegion& rhs) const { +bool SimpleGenomicRegion::operator!=(const SimpleGenomicRegion &rhs) const { return (chrom != rhs.chrom || start != rhs.start || end != rhs.end); } @@ -156,8 +141,7 @@ SimpleGenomicRegion::operator!=(const SimpleGenomicRegion& rhs) const { unordered_map GenomicRegion::fw_table_in; unordered_map GenomicRegion::fw_table_out; -chrom_id_type -GenomicRegion::assign_chrom(const std::string &c) { +chrom_id_type GenomicRegion::assign_chrom(const std::string &c) { auto chr_id(fw_table_in.find(c)); if (chr_id == fw_table_in.end()) { const chrom_id_type r = fw_table_in.size(); @@ -165,78 +149,89 @@ GenomicRegion::assign_chrom(const std::string &c) { fw_table_out[r] = c; return r; } - else return chr_id->second; + else + return chr_id->second; } -string -GenomicRegion::retrieve_chrom(chrom_id_type i) { - unordered_map::const_iterator chr_name(fw_table_out.find(i)); +string GenomicRegion::retrieve_chrom(chrom_id_type i) { + unordered_map::const_iterator chr_name( + fw_table_out.find(i)); assert(chr_name != fw_table_out.end()); return chr_name->second; } - GenomicRegion::GenomicRegion(const char *s, const size_t len) { size_t i = 0; // the chrom - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; if (i == len) throw runtime_error( "malformatted BED file contains only one" "column in the line below " - "(a properly formatted BED file must contain at least three):\n" - + string(s)); + "(a properly formatted BED file must contain at least three):\n" + + string(s)); size_t j = i; - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; if (i == len) throw runtime_error( "malformatted BED file contains only two " "columns in the line below " - "(a properly formatted BED file must contain at least three):\n" - + string(s)); + "(a properly formatted BED file must contain at least three):\n" + + string(s)); chrom = assign_chrom(string(s + j, i - j)); // start of the region (a positive integer) - while (isspace(s[i]) && i < len) ++i; - + while (isspace(s[i]) && i < len) + ++i; j = i; start = atoi(s + j); - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; // end of the region (a positive integer) - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; end = atoi(s + j); - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; // name of the region - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; name = string(s + j, i - j); // score of the region (floating point) - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; score = atof(s + j); - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; // strand - while (isspace(s[i]) && i < len) ++i; + while (isspace(s[i]) && i < len) + ++i; j = i; strand = *(s + j); - while (!isspace(s[i]) && i < len) ++i; + while (!isspace(s[i]) && i < len) + ++i; // ADS: This is a hack!!! - if (strand != '-') strand = '+'; + if (strand != '-') + strand = '+'; } -string -GenomicRegion::tostring() const { +string GenomicRegion::tostring() const { std::ostringstream s; s << retrieve_chrom(chrom) << "\t" << start << "\t" << end; if (!name.empty()) @@ -244,85 +239,65 @@ GenomicRegion::tostring() const { return s.str(); } - -bool -GenomicRegion::contains(const GenomicRegion& other) const { +bool GenomicRegion::contains(const GenomicRegion &other) const { return chrom == other.chrom && start <= other.start && other.end <= end; } - -bool -GenomicRegion::overlaps(const GenomicRegion& other) const { +bool GenomicRegion::overlaps(const GenomicRegion &other) const { return chrom == other.chrom && - ((start < other.end && other.end <= end) || - (start <= other.start && other.start < end) || - other.contains(*this)); + ((start < other.end && other.end <= end) || + (start <= other.start && other.start < end) || other.contains(*this)); } - -size_t -GenomicRegion::distance(const GenomicRegion& other) const { +size_t GenomicRegion::distance(const GenomicRegion &other) const { if (chrom != other.chrom) return std::numeric_limits::max(); else if (overlaps(other) || other.overlaps(*this)) return 0; - else return (end < other.start) ? - other.start - end + 1 : start - other.end + 1; + else + return (end < other.start) ? other.start - end + 1 : start - other.end + 1; } - -bool -GenomicRegion::operator<(const GenomicRegion& rhs) const { +bool GenomicRegion::operator<(const GenomicRegion &rhs) const { return ((chrom == rhs.chrom && (start < rhs.start || (start == rhs.start && - (end < rhs.end || - (end == rhs.end && - (strand < rhs.strand - // || (strand == rhs.strand && name < rhs.name) - )))))) || + (end < rhs.end || (end == rhs.end && + (strand < rhs.strand + // || (strand == rhs.strand && name < rhs.name) + )))))) || get_chrom() < rhs.get_chrom()); } - -bool -GenomicRegion::less1(const GenomicRegion& rhs) const { +bool GenomicRegion::less1(const GenomicRegion &rhs) const { return ((chrom == rhs.chrom && - (end < rhs.end || - (end == rhs.end && - (start < rhs.start || - (start == rhs.start && - (strand < rhs.strand - // || (strand == rhs.strand && name < rhs.name) - )))))) || + (end < rhs.end || (end == rhs.end && + (start < rhs.start || + (start == rhs.start && + (strand < rhs.strand + // || (strand == rhs.strand && name < rhs.name) + )))))) || get_chrom() < rhs.get_chrom()); } - -bool -GenomicRegion::operator<=(const GenomicRegion& rhs) const { +bool GenomicRegion::operator<=(const GenomicRegion &rhs) const { return !(rhs < *this); } - -bool -GenomicRegion::operator==(const GenomicRegion& rhs) const { +bool GenomicRegion::operator==(const GenomicRegion &rhs) const { return (chrom == rhs.chrom && start == rhs.start && end == rhs.end && name == rhs.name && score == rhs.score && strand == rhs.strand); } - -bool -GenomicRegion::operator!=(const GenomicRegion& rhs) const { +bool GenomicRegion::operator!=(const GenomicRegion &rhs) const { return (chrom != rhs.chrom || start != rhs.start || end != rhs.end || name != rhs.name || score != rhs.score || strand != rhs.strand); } - -void -separate_chromosomes(const vector& regions, - vector >& separated_by_chrom) { - typedef unordered_map > Separator; +void separate_chromosomes( + const vector ®ions, + vector> &separated_by_chrom) { + typedef unordered_map> Separator; Separator separator; for (vector::const_iterator i = regions.begin(); i != regions.end(); ++i) { @@ -336,11 +311,9 @@ separate_chromosomes(const vector& regions, separated_by_chrom.push_back(i->second); } - -void -separate_chromosomes(const vector& regions, - vector >& separated_by_chrom) { - typedef unordered_map > Separator; +void separate_chromosomes(const vector ®ions, + vector> &separated_by_chrom) { + typedef unordered_map> Separator; Separator separator; for (vector::const_iterator i = regions.begin(); i != regions.end(); ++i) { @@ -354,9 +327,7 @@ separate_chromosomes(const vector& regions, separated_by_chrom.push_back(i->second); } - -static inline auto -is_header_line(const string& line) -> bool { +static inline auto is_header_line(const string &line) -> bool { static constexpr auto browser_label = "browser"; static constexpr auto browser_label_len = 7u; for (auto i = 0u; i < browser_label_len; ++i) @@ -365,9 +336,7 @@ is_header_line(const string& line) -> bool { return true; } - -static inline auto -is_track_line(const string &line) -> bool { +static inline auto is_track_line(const string &line) -> bool { static constexpr auto track_label = "track"; static constexpr auto track_label_len = 5u; for (auto i = 0u; i < track_label_len; ++i) @@ -376,9 +345,7 @@ is_track_line(const string &line) -> bool { return true; } - -void -ReadBEDFile(const string &filename, vector &the_regions) { +void ReadBEDFile(const string &filename, vector &the_regions) { std::ifstream in(filename); if (!in) throw runtime_error("failed to open file " + filename); @@ -389,9 +356,8 @@ ReadBEDFile(const string &filename, vector &the_regions) { the_regions.push_back(GenomicRegion(line)); } - -void -ReadBEDFile(const string &filename, vector &the_regions) { +void ReadBEDFile(const string &filename, + vector &the_regions) { std::ifstream in(filename); if (isdir(filename.c_str())) throw runtime_error("BED file is a directory: " + filename); diff --git a/GenomicRegion.hpp b/GenomicRegion.hpp index fab1350..f138cf3 100644 --- a/GenomicRegion.hpp +++ b/GenomicRegion.hpp @@ -24,14 +24,14 @@ #ifndef GENOMIC_REGION_HPP #define GENOMIC_REGION_HPP -#include "smithlab_utils.hpp" #include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -#include -#include #include -#include #include +#include +#include +#include typedef unsigned chrom_id_type; @@ -47,27 +47,27 @@ class SimpleGenomicRegion { } // Other constructors - SimpleGenomicRegion(std::string c, size_t sta, size_t e) : - chrom(assign_chrom(c)), start(sta), end(e) {} + SimpleGenomicRegion(std::string c, size_t sta, size_t e) + : chrom(assign_chrom(c)), start(sta), end(e) {} SimpleGenomicRegion(const GenomicRegion &rhs); SimpleGenomicRegion(const char *string_representation, const size_t len); - explicit SimpleGenomicRegion(const std::string &line) : - SimpleGenomicRegion(line.c_str(), line.length()) {} + explicit SimpleGenomicRegion(const std::string &line) + : SimpleGenomicRegion(line.c_str(), line.length()) {} std::string tostring() const; // accessors - std::string get_chrom() const {return retrieve_chrom(chrom);} - size_t get_start() const {return start;} - size_t get_end() const {return end;} - size_t get_width() const {return (end > start) ? end - start : 0;} + std::string get_chrom() const { return retrieve_chrom(chrom); } + size_t get_start() const { return start; } + size_t get_end() const { return end; } + size_t get_width() const { return (end > start) ? end - start : 0; } // mutators void set_chrom(const std::string &new_chrom) { chrom = assign_chrom(new_chrom); } - void set_start(size_t new_start) {start = new_start;} - void set_end(size_t new_end) {end = new_end;} + void set_start(size_t new_start) { start = new_start; } + void set_end(size_t new_end) { end = new_end; } // comparison functions bool contains(const SimpleGenomicRegion &other) const; @@ -83,12 +83,11 @@ class SimpleGenomicRegion { return chrom == other.chrom; } - friend void - separate_chromosomes(const std::vector ®ions, - std::vector > - &separated_by_chrom); -private: + friend void separate_chromosomes( + const std::vector ®ions, + std::vector> &separated_by_chrom); +private: static chrom_id_type assign_chrom(const std::string &c); static std::string retrieve_chrom(chrom_id_type i); @@ -101,8 +100,7 @@ class SimpleGenomicRegion { size_t end; }; -template T& -operator>>(T &the_stream, SimpleGenomicRegion &r) { +template T &operator>>(T &the_stream, SimpleGenomicRegion &r) { std::string buffer; if (getline(the_stream, buffer)) { r = SimpleGenomicRegion(buffer); @@ -110,16 +108,16 @@ operator>>(T &the_stream, SimpleGenomicRegion &r) { return the_stream; } -template T& -operator<<(T &the_stream, const SimpleGenomicRegion &r) { +template T &operator<<(T &the_stream, const SimpleGenomicRegion &r) { the_stream << r.tostring(); return the_stream; } class GenomicRegion { public: - GenomicRegion() : chrom(assign_chrom("(NULL)")), - name("X"), start(0), end(0), score(0), strand('+') {} + GenomicRegion() + : chrom(assign_chrom("(NULL)")), name("X"), start(0), end(0), score(0), + strand('+') {} void swap(GenomicRegion &rhs) { std::swap(chrom, rhs.chrom); std::swap(name, rhs.name); @@ -130,38 +128,41 @@ class GenomicRegion { } // Other constructors - GenomicRegion(std::string c, size_t sta, size_t e, - std::string n, float sc, char str) : - chrom(assign_chrom(c)), name(n), start(sta), end(e), score(sc), strand(str) {} - GenomicRegion(std::string c, size_t sta, size_t e) : - chrom(assign_chrom(c)), name(std::string("X")), - start(sta), end(e), score(0.0), strand('+') {} + GenomicRegion(std::string c, size_t sta, size_t e, std::string n, float sc, + char str) + : chrom(assign_chrom(c)), name(n), start(sta), end(e), score(sc), + strand(str) {} + GenomicRegion(std::string c, size_t sta, size_t e) + : chrom(assign_chrom(c)), name(std::string("X")), start(sta), end(e), + score(0.0), strand('+') {} GenomicRegion(const char *s, const size_t len); - explicit GenomicRegion(const std::string &line) : - GenomicRegion(line.c_str(), line.length()) {} - GenomicRegion(const SimpleGenomicRegion &other) : - chrom(assign_chrom(other.get_chrom())), name("(NULL)"), - start(other.get_start()), end(other.get_end()), score(0), strand('+') {} + explicit GenomicRegion(const std::string &line) + : GenomicRegion(line.c_str(), line.length()) {} + GenomicRegion(const SimpleGenomicRegion &other) + : chrom(assign_chrom(other.get_chrom())), name("(NULL)"), + start(other.get_start()), end(other.get_end()), score(0), strand('+') {} std::string tostring() const; // accessors - std::string get_chrom() const {return retrieve_chrom(chrom);} - size_t get_start() const {return start;} - size_t get_end() const {return end;} - size_t get_width() const {return (end > start) ? end - start : 0;} - std::string get_name() const {return name;} - float get_score() const {return score;} - char get_strand() const {return strand;} - bool pos_strand() const {return (strand == '+');} - bool neg_strand() const {return (strand == '-');} + std::string get_chrom() const { return retrieve_chrom(chrom); } + size_t get_start() const { return start; } + size_t get_end() const { return end; } + size_t get_width() const { return (end > start) ? end - start : 0; } + std::string get_name() const { return name; } + float get_score() const { return score; } + char get_strand() const { return strand; } + bool pos_strand() const { return (strand == '+'); } + bool neg_strand() const { return (strand == '-'); } // mutators - void set_chrom(const std::string &new_chrom) {chrom = assign_chrom(new_chrom);} - void set_start(size_t new_start) {start = new_start;} - void set_end(size_t new_end) {end = new_end;} - void set_name(const std::string &n) {name = n;} - void set_score(float s) {score = s;} - void set_strand(char s) {strand = s;} + void set_chrom(const std::string &new_chrom) { + chrom = assign_chrom(new_chrom); + } + void set_start(size_t new_start) { start = new_start; } + void set_end(size_t new_end) { end = new_end; } + void set_name(const std::string &n) { name = n; } + void set_score(float s) { score = s; } + void set_strand(char s) { strand = s; } // comparison functions bool contains(const GenomicRegion &other) const; @@ -177,13 +178,11 @@ class GenomicRegion { return chrom == other.chrom; } - friend void - separate_chromosomes(const std::vector ®ions, - std::vector > - &separated_by_chrom); + friend void separate_chromosomes( + const std::vector ®ions, + std::vector> &separated_by_chrom); private: - static chrom_id_type assign_chrom(const std::string &c); static std::string retrieve_chrom(chrom_id_type i); @@ -199,21 +198,14 @@ class GenomicRegion { char strand; }; - -template -bool -score_less(const T &a, const T &b) { +template bool score_less(const T &a, const T &b) { return a.get_score() < b.get_score(); } -template -bool -score_greater(const T &a, const T &b) { +template bool score_greater(const T &a, const T &b) { return a.get_score() > b.get_score(); } - -template T& -operator>>(T &the_stream, GenomicRegion &r) { +template T &operator>>(T &the_stream, GenomicRegion &r) { std::string buffer; if (getline(the_stream, buffer)) { r = GenomicRegion(buffer); @@ -221,26 +213,23 @@ operator>>(T &the_stream, GenomicRegion &r) { return the_stream; } -template T& -operator<<(T &the_stream, const GenomicRegion &r) { +template T &operator<<(T &the_stream, const GenomicRegion &r) { the_stream << r.tostring(); return the_stream; } - template -void -sync_chroms(const std::vector > &stable, - std::vector > &to_sync) { +void sync_chroms(const std::vector> &stable, + std::vector> &to_sync) { std::unordered_map chrom_index; for (size_t i = 0; i < stable.size(); ++i) if (!stable[i].empty()) chrom_index[stable[i].front().get_chrom()] = i; - std::vector > syncd(stable.size()); + std::vector> syncd(stable.size()); for (size_t i = 0; i < to_sync.size(); ++i) { if (!to_sync[i].empty()) { std::unordered_map::const_iterator j = - chrom_index.find(to_sync[i].front().get_chrom()); + chrom_index.find(to_sync[i].front().get_chrom()); if (j != chrom_index.end()) to_sync[i].swap(syncd[j->second]); } @@ -248,11 +237,10 @@ sync_chroms(const std::vector > &stable, syncd.swap(to_sync); } - -template void -separate_regions(const std::vector &big_regions, - const std::vector ®ions, - std::vector > &sep_regions) { +template +void separate_regions(const std::vector &big_regions, + const std::vector ®ions, + std::vector> &sep_regions) { size_t rr_id = 0; const size_t n_regions = regions.size(); const size_t n_big_regions = big_regions.size(); @@ -261,81 +249,81 @@ separate_regions(const std::vector &big_regions, const std::string current_chrom(big_regions[i].get_chrom()); const size_t current_start = big_regions[i].get_start(); const size_t current_end = big_regions[i].get_end(); - while (rr_id < n_regions && - (regions[rr_id].get_chrom() < current_chrom || - (regions[rr_id].get_chrom() == current_chrom && - regions[rr_id].get_start() < current_start))) + while (rr_id < n_regions && (regions[rr_id].get_chrom() < current_chrom || + (regions[rr_id].get_chrom() == current_chrom && + regions[rr_id].get_start() < current_start))) ++rr_id; - while (rr_id < n_regions && - (regions[rr_id].get_chrom() == current_chrom && - regions[rr_id].get_start() < current_end)) { + while (rr_id < n_regions && (regions[rr_id].get_chrom() == current_chrom && + regions[rr_id].get_start() < current_end)) { sep_regions[i].push_back(regions[rr_id]); ++rr_id; } } } - -template bool -check_sorted(const std::vector ®ions) { +template bool check_sorted(const std::vector ®ions) { for (size_t i = 1; i < regions.size(); ++i) if (regions[i] < regions[i - 1]) return false; return true; } - template typename std::vector::const_iterator find_closest(const std::vector &targets, const T &query) { const auto closest = std::lower_bound(begin(targets), end(targets), query); - if (closest == begin(targets)) return closest; - if (closest == end(targets)) return (closest - 1); - return (query.distance(*closest) < query.distance(*(closest - 1))) ? - closest : (closest - 1); + if (closest == begin(targets)) + return closest; + if (closest == end(targets)) + return (closest - 1); + return (query.distance(*closest) < query.distance(*(closest - 1))) + ? closest + : (closest - 1); } - template -typename std::vector::iterator -find_closest(std::vector ®ions, const T ®ion) { +typename std::vector::iterator find_closest(std::vector ®ions, + const T ®ion) { typename std::vector::iterator closest = - lower_bound(regions.begin(), regions.end(), region); - if (closest == regions.begin()) return closest; - if (closest == regions.end()) return (closest - 1); - return (region.distance(*closest) < region.distance(*(closest - 1))) ? - closest : (closest - 1); + lower_bound(regions.begin(), regions.end(), region); + if (closest == regions.begin()) + return closest; + if (closest == regions.end()) + return (closest - 1); + return (region.distance(*closest) < region.distance(*(closest - 1))) + ? closest + : (closest - 1); } - -template void -collapse(std::vector ®ions) { +template void collapse(std::vector ®ions) { typename std::vector::iterator i, good = regions.begin(); for (i = regions.begin() + 1; i != regions.end(); ++i) if (i->overlaps(*good)) { good->set_start(std::min(i->get_start(), good->get_start())); good->set_end(std::max(i->get_end(), good->get_end())); } - else *(++good) = *i; + else + *(++good) = *i; regions.erase(++good, regions.end()); } - -template T -genomic_region_intersection(const T &a, const T &b) { - if (!a.overlaps(b)) return T(a.get_chrom(), 0, 0); - else if (a.contains(b)) return b; - else if (b.contains(a)) return a; - else if (a < b) return T(a.get_chrom(), b.get_start(), a.get_end()); - else return T(a.get_chrom(), a.get_start(), b.get_end()); +template T genomic_region_intersection(const T &a, const T &b) { + if (!a.overlaps(b)) + return T(a.get_chrom(), 0, 0); + else if (a.contains(b)) + return b; + else if (b.contains(a)) + return a; + else if (a < b) + return T(a.get_chrom(), b.get_start(), a.get_end()); + else + return T(a.get_chrom(), a.get_start(), b.get_end()); } - template -void -genomic_region_intersection(const std::vector ®ions_a, - const std::vector ®ions_b, - std::vector ®ions_c) { +void genomic_region_intersection(const std::vector ®ions_a, + const std::vector ®ions_b, + std::vector ®ions_c) { typename std::vector::const_iterator a(regions_a.begin()); typename std::vector::const_iterator a_lim(regions_a.end()); typename std::vector::const_iterator b(regions_b.begin()); @@ -343,18 +331,21 @@ genomic_region_intersection(const std::vector ®ions_a, while (a != a_lim && b != b_lim) { if (a->overlaps(*b)) regions_c.push_back(*b); - if (a == b) {++a; ++b;} - else if (*a < *b) ++a; - else ++b; // if (*b < *a) + if (a == b) { + ++a; + ++b; + } + else if (*a < *b) + ++a; + else + ++b; // if (*b < *a) } } - template -void -genomic_region_intersection_by_base(const std::vector ®ions_a, - const std::vector ®ions_b, - std::vector ®ions_c) { +void genomic_region_intersection_by_base(const std::vector ®ions_a, + const std::vector ®ions_b, + std::vector ®ions_c) { typename std::vector::const_iterator a(regions_a.begin()); typename std::vector::const_iterator a_lim(regions_a.end()); typename std::vector::const_iterator b(regions_b.begin()); @@ -364,36 +355,39 @@ genomic_region_intersection_by_base(const std::vector ®ions_a, regions_c.push_back(T(a->get_chrom(), std::max(a->get_start(), b->get_start()), std::min(a->get_end(), b->get_end()))); - if (a == b) {++a; ++b;} - else if (a->less1(*b)) ++a; - else ++b; // if (*b < *a) + if (a == b) { + ++a; + ++b; + } + else if (a->less1(*b)) + ++a; + else + ++b; // if (*b < *a) } } -void -ReadBEDFile(const std::string &filename, - std::vector ®ions); +void ReadBEDFile(const std::string &filename, + std::vector ®ions); -void -ReadBEDFile(const std::string &filename, - std::vector ®ions); +void ReadBEDFile(const std::string &filename, + std::vector ®ions); -template void -WriteBEDFile(const std::string filename, - const std::vector > ®ions, - std::string track_name = "") { +template +void WriteBEDFile(const std::string filename, + const std::vector> ®ions, + std::string track_name = "") { std::ofstream out(filename.c_str()); if (track_name.length() > 0) out << "track name=" << track_name << std::endl; - for (typename std::vector >::const_iterator i = - regions.begin(); i != regions.end(); ++i) + for (typename std::vector>::const_iterator i = regions.begin(); + i != regions.end(); ++i) std::copy(i->begin(), i->end(), std::ostream_iterator(out, "\n")); out.close(); } -template void -WriteBEDFile(const std::string filename, - const std::vector ®ions, std::string track_name = "") { +template +void WriteBEDFile(const std::string filename, const std::vector ®ions, + std::string track_name = "") { std::ofstream out(filename.c_str()); if (track_name.length() > 0) out << "track name=" << track_name << std::endl; @@ -402,16 +396,13 @@ WriteBEDFile(const std::string filename, out.close(); } -template -std::string -assemble_region_name(const T ®ion) { +template std::string assemble_region_name(const T ®ion) { return (region.get_chrom() + ":" + smithlab::toa(region.get_start()) + "-" + smithlab::toa(region.get_end())); } template -std::string -assemble_region_name(const T ®ion, const std::string sep) { +std::string assemble_region_name(const T ®ion, const std::string sep) { return (region.get_chrom() + sep + smithlab::toa(region.get_start()) + sep + smithlab::toa(region.get_end())); } diff --git a/MappedRead.cpp b/MappedRead.cpp index 0d23ca8..a242486 100644 --- a/MappedRead.cpp +++ b/MappedRead.cpp @@ -23,17 +23,17 @@ #include "MappedRead.hpp" #include "smithlab_utils.hpp" -#include #include +#include #include #include -using std::string; using std::runtime_error; +using std::string; MappedRead::MappedRead(const string &line) { std::istringstream is; - is.rdbuf()->pubsetbuf(const_cast(line.c_str()), line.size()); + is.rdbuf()->pubsetbuf(const_cast(line.c_str()), line.size()); string chrom, name, tmp; size_t start = 0ul, end = 0ul; @@ -41,7 +41,7 @@ MappedRead::MappedRead(const string &line) { double score; if (is >> chrom >> start >> tmp) { if (find_if(tmp.begin(), tmp.end(), - [](char c) {return !std::isdigit(c);}) == tmp.end()) { + [](char c) { return !std::isdigit(c); }) == tmp.end()) { end = std::stol(tmp); if (!(is >> name >> score >> strand >> seq)) throw runtime_error("bad line in MappedRead file: " + line); @@ -55,11 +55,11 @@ MappedRead::MappedRead(const string &line) { r = GenomicRegion(chrom, start, end, name, score, strand); is >> scr; } - else throw runtime_error("bad line in MappedRead file: " + line); + else + throw runtime_error("bad line in MappedRead file: " + line); } -string -MappedRead::tostring() const { +string MappedRead::tostring() const { std::ostringstream oss; oss << r; // no chaining for the << of GenomicRegion oss << '\t' << seq; diff --git a/MappedRead.hpp b/MappedRead.hpp index 0299086..e624418 100644 --- a/MappedRead.hpp +++ b/MappedRead.hpp @@ -34,8 +34,7 @@ struct MappedRead { std::string tostring() const; }; -template T& -operator>>(T &the_stream, MappedRead &mr) { +template T &operator>>(T &the_stream, MappedRead &mr) { std::string buffer; if (getline(the_stream, buffer)) { mr = MappedRead(buffer); @@ -43,8 +42,7 @@ operator>>(T &the_stream, MappedRead &mr) { return the_stream; } -template T& -operator<<(T &the_stream, const MappedRead &mr) { +template T &operator<<(T &the_stream, const MappedRead &mr) { the_stream << mr.tostring(); return the_stream; } diff --git a/OptionParser.cpp b/OptionParser.cpp index 9585ba6..436f67a 100644 --- a/OptionParser.cpp +++ b/OptionParser.cpp @@ -23,36 +23,41 @@ #include "OptionParser.hpp" -#include -#include -#include #include -#include -#include -#include #include +#include +#include +#include +#include #include +#include #include +#include #include "smithlab_utils.hpp" -using std::vector; -using std::string; -using std::endl; -using std::runtime_error; using std::begin; using std::end; +using std::endl; +using std::runtime_error; +using std::string; +using std::vector; static const size_t MAX_LINE_LENGTH = 72; enum { - SMITHLAB_ARG_INT, SMITHLAB_ARG_UINT, SMITHLAB_ARG_LONG, - SMITHLAB_ARG_ULONG, SMITHLAB_ARG_FLOAT, SMITHLAB_ARG_DOUBLE, - SMITHLAB_ARG_STRING, SMITHLAB_ARG_BOOL, SMITHLAB_ARG_CHAR + SMITHLAB_ARG_INT, + SMITHLAB_ARG_UINT, + SMITHLAB_ARG_LONG, + SMITHLAB_ARG_ULONG, + SMITHLAB_ARG_FLOAT, + SMITHLAB_ARG_DOUBLE, + SMITHLAB_ARG_STRING, + SMITHLAB_ARG_BOOL, + SMITHLAB_ARG_CHAR }; -void -Option::format_option(const string &argument) { +void Option::format_option(const string &argument) { std::istringstream ss(argument); if ((arg_type == SMITHLAB_ARG_INT && !(ss >> *int_value)) || (arg_type == SMITHLAB_ARG_UINT && !(ss >> *uint_value)) || @@ -61,8 +66,8 @@ Option::format_option(const string &argument) { (arg_type == SMITHLAB_ARG_FLOAT && !(ss >> *float_value)) || (arg_type == SMITHLAB_ARG_DOUBLE && !(ss >> *double_value)) || (arg_type == SMITHLAB_ARG_CHAR && !(ss >> *char_value))) - throw runtime_error("Invalid argument [" + argument + - "] to option [" + format_option_name() + "]"); + throw runtime_error("Invalid argument [" + argument + "] to option [" + + format_option_name() + "]"); else if (arg_type == SMITHLAB_ARG_STRING) *string_value = argument; else if (arg_type == SMITHLAB_ARG_BOOL) { @@ -77,32 +82,37 @@ Option::format_option(const string &argument) { using std::numeric_limits; using std::to_string; -template string -format_int_like(T &val) { +template string format_int_like(T &val) { return "[" + - ((val == numeric_limits::max()) ? "infty" : - ((val == -numeric_limits::max()) ? "-infty" : to_string(val))) + "]"; + ((val == numeric_limits::max()) + ? "infty" + : ((val == -numeric_limits::max()) ? "-infty" + : to_string(val))) + + "]"; } -template string -format_unsigned_like(T &val) { - return "[" + - ((val == numeric_limits::max()) ? "infty" : to_string(val)) + "]"; +template string format_unsigned_like(T &val) { + return "[" + ((val == numeric_limits::max()) ? "infty" : to_string(val)) + + "]"; } -template string -format_float_like(T &val) { +template string format_float_like(T &val) { return "[" + - ((val == numeric_limits::max()) ? "infty" : - ((val == -numeric_limits::max()) ? "-infty" : - ((val == numeric_limits::min()) ? "eps" : - ((val == -numeric_limits::min()) ? "-eps" : - ((std::abs(val) < numeric_limits::min()) ? "0.0" : - to_string(val)))))) + "]"; + ((val == numeric_limits::max()) + ? "infty" + : ((val == -numeric_limits::max()) + ? "-infty" + : ((val == numeric_limits::min()) + ? "eps" + : ((val == -numeric_limits::min()) + ? "-eps" + : ((std::abs(val) < numeric_limits::min()) + ? "0.0" + : to_string(val)))))) + + "]"; } -string -Option::format_default_value() const { +string Option::format_default_value() const { std::istringstream ss; if (arg_type == SMITHLAB_ARG_INT) return format_int_like(*int_value); @@ -120,76 +130,76 @@ Option::format_default_value() const { return (*string_value).empty() ? "" : "[" + *string_value + "]"; else if (arg_type == SMITHLAB_ARG_CHAR) return "[" + string(1, *char_value) + "]"; - else // if (arg_type == SMITHLAB_ARG_BOOL) + else // if (arg_type == SMITHLAB_ARG_BOOL) return ""; //*bool_value ? "true" : "false"; } - //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, int &val) : - arg_type(SMITHLAB_ARG_INT), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), int_value(&val) {} + const bool reqd, int &val) + : arg_type(SMITHLAB_ARG_INT), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), int_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, unsigned int &val) : - arg_type(SMITHLAB_ARG_UINT), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), uint_value(&val) {} + const bool reqd, unsigned int &val) + : arg_type(SMITHLAB_ARG_UINT), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), uint_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, long &val) : - arg_type(SMITHLAB_ARG_LONG), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), long_value(&val) {} + const bool reqd, long &val) + : arg_type(SMITHLAB_ARG_LONG), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), long_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, unsigned long &val) : - arg_type(SMITHLAB_ARG_ULONG), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), ulong_value(&val) {} + const bool reqd, unsigned long &val) + : arg_type(SMITHLAB_ARG_ULONG), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), ulong_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, float &val) : - arg_type(SMITHLAB_ARG_FLOAT), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), float_value(&val) {} + const bool reqd, float &val) + : arg_type(SMITHLAB_ARG_FLOAT), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), float_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, double &val) : - arg_type(SMITHLAB_ARG_DOUBLE), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), double_value(&val) {} + const bool reqd, double &val) + : arg_type(SMITHLAB_ARG_DOUBLE), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), double_value(&val) { +} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, string &val) : - arg_type(SMITHLAB_ARG_STRING), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), string_value(&val) {} + const bool reqd, string &val) + : arg_type(SMITHLAB_ARG_STRING), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), string_value(&val) { +} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, bool &val) : - arg_type(SMITHLAB_ARG_BOOL), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), bool_value(&val) {} + const bool reqd, bool &val) + : arg_type(SMITHLAB_ARG_BOOL), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), bool_value(&val) {} Option::Option(const string l_name, const char s_name, const string descr, - const bool reqd, char &val) : - arg_type(SMITHLAB_ARG_CHAR), long_name(l_name), short_name(s_name), - description(descr), required(reqd), specified(false), char_value(&val) {} + const bool reqd, char &val) + : arg_type(SMITHLAB_ARG_CHAR), long_name(l_name), short_name(s_name), + description(descr), required(reqd), specified(false), char_value(&val) {} //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// -string -Option::format_option_name() const { +string Option::format_option_name() const { std::ostringstream ss; if (short_name != '\0') ss << '-' << short_name << ", -" << long_name; - else ss << " -" << long_name; + else + ss << " -" << long_name; return ss.str(); } -string -Option::format_option_description(const size_t offset, - const bool show_default) const { +string Option::format_option_description(const size_t offset, + const bool show_default) const { std::ostringstream ss; if (!description.empty()) { vector parts; @@ -208,22 +218,20 @@ Option::format_option_description(const size_t offset, if (i > 0 && line_len == 0) ss << string(offset, ' '); ss << parts[i] << " "; - line_len += parts[i].size()+1; //+1 for the added space + line_len += parts[i].size() + 1; //+1 for the added space } } return ss.str(); } -bool -Option::option_match(const string &other) { +bool Option::option_match(const string &other) { return (long_name == other || (other.length() > 1 && other[0] == '-' && (other.substr(1) == long_name || (other[1] == short_name && other.length() == 2)))); } -bool -Option::parse(vector &command_line) { +bool Option::parse(vector &command_line) { static const string dummy; if (!command_line.empty()) { for (size_t i = 0; i < command_line.size();) @@ -256,8 +264,7 @@ Option::parse(vector &command_line) { return (specified || !required); } -void -Option::parse_config_file(vector &options) { +void Option::parse_config_file(vector &options) { size_t i = 0; size_t op_num = options.size(); while (i < op_num) { @@ -281,57 +288,49 @@ Option::parse_config_file(vector &options) { //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, int &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, int &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, unsigned &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, unsigned &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, long &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, long &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, unsigned long &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, + unsigned long &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, float &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, float &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, double &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, double &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, string &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, string &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, bool &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, bool &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } -void -OptionParser::add_opt(const string l_name, const char s_name, const string descr, - const bool reqd, char &val) { +void OptionParser::add_opt(const string l_name, const char s_name, + const string descr, const bool reqd, char &val) { options.push_back(Option(l_name, s_name, descr, reqd, val)); } @@ -340,12 +339,9 @@ OptionParser::add_opt(const string l_name, const char s_name, const string descr //////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// -bool valid_option_char(char ch) { - return std::isalnum(ch) || ch == '_'; -} +bool valid_option_char(char ch) { return std::isalnum(ch) || ch == '_'; } -static void -fix_whitespace(string &s) { +static void fix_whitespace(string &s) { std::istringstream iss(s); string token; s.clear(); @@ -356,9 +352,8 @@ fix_whitespace(string &s) { } } -static void -read_config_file(const string &config_filename, - vector &config_file_options) { +static void read_config_file(const string &config_filename, + vector &config_file_options) { static const char comment_character = '#'; static const char separator_character = ':'; @@ -381,7 +376,7 @@ read_config_file(const string &config_filename, const size_t sep_pos = line.find_first_of(separator_character); - if (sep_pos == 0 || // catches ": " + if (sep_pos == 0 || // catches ": " sep_pos >= line.length() - 1) // catches no sep or final char sep throw runtime_error("bad config file line: " + line); @@ -403,10 +398,9 @@ read_config_file(const string &config_filename, } } -vector -OptionParser::parse(const int argc, const char **argv) { +vector OptionParser::parse(const int argc, const char **argv) { // The "2" below corresponds to the "about" and "help" options - assert(options.size() >= 2); + assert(options.size() >= 2); vector arguments; @@ -422,7 +416,7 @@ OptionParser::parse(const int argc, const char **argv) { vector config_file_options; string config_filename; if (i + 1 < argc - 1) - config_filename = arguments[i+1]; + config_filename = arguments[i + 1]; else // ads: need to check that this is really a filename throw runtime_error("--config requires config filename"); @@ -448,17 +442,15 @@ OptionParser::parse(const int argc, const char **argv) { return arguments; } -void -OptionParser::parse(const int argc, const char **argv, - vector &arguments) { +void OptionParser::parse(const int argc, const char **argv, + vector &arguments) { arguments = parse(argc, argv); } -void -OptionParser::parse(const int argc, const char **argv, - vector &arguments, string config_filename) { +void OptionParser::parse(const int argc, const char **argv, + vector &arguments, string config_filename) { // The "2" below corresponds to the "about" and "help" options - assert(options.size() >= 2); + assert(options.size() >= 2); if (!config_filename.empty()) { vector config_file_options; @@ -480,10 +472,10 @@ OptionParser::parse(const int argc, const char **argv, } OptionParser::OptionParser(const string nm, const string descr, - string noflag_msg, const size_t n_left) : - prog_name(nm), prog_descr(descr), noflag_message(noflag_msg), - help_request(false), about_request(false), - show_defaults(false), n_leftover(n_left) { + string noflag_msg, const size_t n_left) + : prog_name(nm), prog_descr(descr), noflag_message(noflag_msg), + help_request(false), about_request(false), show_defaults(false), + n_leftover(n_left) { add_opt("help", '?', "print this help message", false, help_request); add_opt("about", '\0', "print about message", false, about_request); } @@ -495,8 +487,7 @@ OptionParser::OptionParser(const string nm, const string descr, ////// FOR PRINTING MESSAGES ////// -string -OptionParser::help_message() const { +string OptionParser::help_message() const { // corresponds to the two spaces before and static const string SPACE_BEFORE_SHORT = " "; static const string SPACE_BTWN_SHRT_LNG = " "; @@ -504,7 +495,7 @@ OptionParser::help_message() const { vector option_names; size_t max_name_len = 0; - for(size_t i = 0; i < options.size(); ++i) { + for (size_t i = 0; i < options.size(); ++i) { option_names.push_back(options[i].format_option_name()); max_name_len = std::max(max_name_len, option_names.back().length()); } @@ -522,23 +513,22 @@ OptionParser::help_message() const { for (size_t i = 2; i < options.size(); ++i) ss << SPACE_BEFORE_SHORT << std::left << std::setw(max_name_len) << option_names[i] << SPACE_BTWN_SHRT_LNG - << options[i].format_option_description(max_name_len + - TOTAL_ADDED_SPACE, - show_defaults) << endl; + << options[i].format_option_description( + max_name_len + TOTAL_ADDED_SPACE, show_defaults) + << endl; } ss << endl << "Help options:" << endl; for (size_t i = 0; i < std::min(2ul, options.size()); ++i) ss << SPACE_BEFORE_SHORT << std::left << std::setw(max_name_len) << option_names[i] << SPACE_BTWN_SHRT_LNG - << options[i].format_option_description(max_name_len + - TOTAL_ADDED_SPACE, - show_defaults) << endl; + << options[i].format_option_description(max_name_len + TOTAL_ADDED_SPACE, + show_defaults) + << endl; return ss.str(); } -string -OptionParser::about_message() const { +string OptionParser::about_message() const { static const char *PROGRAM_NAME_TAG = "PROGRAM: "; vector parts; @@ -553,22 +543,17 @@ OptionParser::about_message() const { line_len = 0; ss << endl; } - else ss << ' '; + else + ss << ' '; ss << parts[i]; line_len += parts[i].length() + 1; // the "+1" is for the space } return ss.str(); } +string OptionParser::about_message_raw() const { return prog_descr; } -string -OptionParser::about_message_raw() const { - return prog_descr; -} - - -string -OptionParser::invalid_leftover() const { +string OptionParser::invalid_leftover() const { static const string left_tag("invalid leftover args [should be "); static const string right_tag("]"); @@ -577,15 +562,12 @@ OptionParser::invalid_leftover() const { ss << left_tag << n_leftover << right_tag << endl; } for (size_t i = 0; i < leftover_args.size(); ++i) { - ss << "leftover arg #" << (i + 1) << "=\"" - << leftover_args[i] << "\""; + ss << "leftover arg #" << (i + 1) << "=\"" << leftover_args[i] << "\""; } return ss.str(); } - -string -OptionParser::option_missing_message() const { +string OptionParser::option_missing_message() const { std::ostringstream ss; ss << "required argument missing: [" << first_missing_option_name << "]"; return ss.str(); diff --git a/OptionParser.hpp b/OptionParser.hpp index 616bce4..f67c15d 100644 --- a/OptionParser.hpp +++ b/OptionParser.hpp @@ -19,30 +19,30 @@ #ifndef OPTION_PARSER_HPP #define OPTION_PARSER_HPP +#include #include #include -#include class Option { public: - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, int &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, unsigned &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, long &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, unsigned long &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, float &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, double &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, std::string &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, bool &val); - Option(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, char &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, int &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, unsigned &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, long &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, unsigned long &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, float &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, double &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, std::string &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, bool &val); + Option(const std::string l_name, const char s_name, const std::string descr, + const bool reqd, char &val); bool parse(std::vector &command_line); @@ -54,7 +54,6 @@ class Option { std::string format_default_value() const; private: - unsigned arg_type; std::string long_name; char short_name; @@ -81,57 +80,52 @@ class Option { class OptionParser { public: - OptionParser(const std::string nm, const std::string descr, std::string noflag_msg = "", const size_t n_left = std::numeric_limits::max()); - void set_show_defaults() {show_defaults = true;} + void set_show_defaults() { show_defaults = true; } void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, int &val); + const std::string descr, const bool reqd, int &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, unsigned &val); + const std::string descr, const bool reqd, unsigned &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, long &val); + const std::string descr, const bool reqd, long &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, unsigned long &val); + const std::string descr, const bool reqd, unsigned long &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, float &val); + const std::string descr, const bool reqd, float &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, double &val); + const std::string descr, const bool reqd, double &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, std::string &val); + const std::string descr, const bool reqd, std::string &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, bool &val); + const std::string descr, const bool reqd, bool &val); void add_opt(const std::string l_name, const char s_name, - const std::string descr, const bool reqd, char &val); + const std::string descr, const bool reqd, char &val); void parse(const int argc, const char **argv, std::vector &arguments); - std::vector - parse(const int argc, const char **argv); + std::vector parse(const int argc, const char **argv); void parse(const int argc, const char **argv, - std::vector &arguments, - std::string config_filename); + std::vector &arguments, std::string config_filename); - bool help_requested() const {return help_request;} + bool help_requested() const { return help_request; } std::string help_message() const; - bool about_requested() const {return about_request;} + bool about_requested() const { return about_request; } std::string about_message() const; std::string about_message_raw() const; std::string invalid_leftover() const; - bool option_missing() const { - return !first_missing_option_name.empty(); - } + bool option_missing() const { return !first_missing_option_name.empty(); } bool wrong_number_leftover() const { return n_leftover != std::numeric_limits::max() && - leftover_args.size() != n_leftover; + leftover_args.size() != n_leftover; } std::string option_missing_message() const; diff --git a/QualityScore.cpp b/QualityScore.cpp index 0e29c17..a236bfb 100644 --- a/QualityScore.cpp +++ b/QualityScore.cpp @@ -22,21 +22,19 @@ #include "QualityScore.hpp" -#include #include "smithlab_utils.hpp" +#include -using std::string; using std::runtime_error; +using std::string; -static bool -check_formats(char c, bool &solexa, bool &phred) { +static bool check_formats(char c, bool &solexa, bool &phred) { solexa = solexa && valid_solexa_score(c); phred = phred && valid_phred_score(c); return (solexa && phred); } -FASTQScoreType -fastq_score_type(const string filename) { +FASTQScoreType fastq_score_type(const string filename) { static const size_t MAX_LINE_SIZE = 1000; std::ifstream f(filename.c_str()); if (!f) @@ -48,7 +46,8 @@ fastq_score_type(const string filename) { while (f.getline(line, MAX_LINE_SIZE)) { if (line_count % 4 == 3) { char *c = line; - while (*c != '\0' && check_formats(*c, solexa, phred)) ++c; + while (*c != '\0' && check_formats(*c, solexa, phred)) + ++c; if (!check_formats(*c, solexa, phred)) return ((phred) ? FASTQ_Phred : FASTQ_Solexa); } @@ -57,8 +56,7 @@ fastq_score_type(const string filename) { return (phred) ? FASTQ_Phred : FASTQ_Solexa; } -FASTQScoreType -mapped_reads_score_type(const string filename) { +FASTQScoreType mapped_reads_score_type(const string filename) { static const size_t MAX_LINE_SIZE = 10000; std::ifstream f(filename.c_str()); if (!f) @@ -84,7 +82,8 @@ mapped_reads_score_type(const string filename) { while (position < MAX_LINE_SIZE && isspace(line[position])) ++position; char *c = line + position; - while (!isspace(*c) && check_formats(*c, solexa, phred)) ++c; + while (!isspace(*c) && check_formats(*c, solexa, phred)) + ++c; if (!check_formats(*c, solexa, phred)) return ((phred) ? FASTQ_Phred : FASTQ_Solexa); } diff --git a/QualityScore.hpp b/QualityScore.hpp index 5c66d76..1cf1010 100644 --- a/QualityScore.hpp +++ b/QualityScore.hpp @@ -23,8 +23,8 @@ #ifndef QUALITY_SCORE_HPP #define QUALITY_SCORE_HPP -#include #include +#include #include //////////////////////////////////////////////////////////////////////// @@ -37,137 +37,99 @@ typedef size_t FASTQScoreType; static const FASTQScoreType FASTQ_Solexa = 0; static const FASTQScoreType FASTQ_Phred = 1; -inline bool -FASTQScoreIsSolexa(const FASTQScoreType t) {return t == FASTQ_Solexa;} +inline bool FASTQScoreIsSolexa(const FASTQScoreType t) { + return t == FASTQ_Solexa; +} -inline bool -FASTQScoreIsPhred(const FASTQScoreType t) {return t == FASTQ_Phred;} +inline bool FASTQScoreIsPhred(const FASTQScoreType t) { + return t == FASTQ_Phred; +} static const double neg_ten_over_log_ten = -4.342944819032517501; //// CONVERT _FROM_ ERROR PROBABILITIES -inline double -error_probability_to_phred(const double r) { - return neg_ten_over_log_ten*std::log(r); +inline double error_probability_to_phred(const double r) { + return neg_ten_over_log_ten * std::log(r); } -inline double -error_probability_to_solexa(const double r) { - return neg_ten_over_log_ten*(std::log(r) - std::log(1.0 - r)); +inline double error_probability_to_solexa(const double r) { + return neg_ten_over_log_ten * (std::log(r) - std::log(1.0 - r)); } //// CONVERT _TO_ ERROR PROBABILITIES -inline double -phred_to_error_probability(const double r) { - const double h = r/neg_ten_over_log_ten; +inline double phred_to_error_probability(const double r) { + const double h = r / neg_ten_over_log_ten; return std::exp(h); } -inline double -solexa_to_error_probability(const double r) { - const double s = r/neg_ten_over_log_ten; - return std::exp(s)/(1.0 + std::exp(s)); +inline double solexa_to_error_probability(const double r) { + const double s = r / neg_ten_over_log_ten; + return std::exp(s) / (1.0 + std::exp(s)); } //// CONVERT _FROM_ QUALITY CHARACTERS (I.E. THE CHARACTERS IN FASTQ FILES) -inline char -quality_character_to_phred(const char c) { - return char(c - 33); -} -inline char -quality_character_to_solexa(const char c) { - return char(c - 64); -} +inline char quality_character_to_phred(const char c) { return char(c - 33); } +inline char quality_character_to_solexa(const char c) { return char(c - 64); } //// CONVERT _TO_ QUALITY CHARACTERS (I.E. THE CHARACTERS IN FASTQ FILES) -inline char -phred_to_quality_character(const double h) { +inline char phred_to_quality_character(const double h) { return char(std::min(60.0, h) + 33); } -inline char -solexa_to_quality_character(const double s) { +inline char solexa_to_quality_character(const double s) { return char(std::min(40.0, s) + 64); } //// CHECK FOR VALIDITY OF A FASTQ SCORE CHARACTER -inline bool -valid_phred_score(const char c) { - return (c >= 33 && c <= 93); -} -inline bool -valid_solexa_score(const char c) { +inline bool valid_phred_score(const char c) { return (c >= 33 && c <= 93); } +inline bool valid_solexa_score(const char c) { // return (c >= 64 && c <= 104); return (c >= 59 && c <= 104); // to allow for old Solexa format } -inline double -quality_char_to_error_probability(const FASTQScoreType t, - const char c) { - return (t == FASTQ_Solexa) ? - solexa_to_error_probability(quality_character_to_solexa(c)) : - phred_to_error_probability(quality_character_to_phred(c)); +inline double quality_char_to_error_probability(const FASTQScoreType t, + const char c) { + return (t == FASTQ_Solexa) + ? solexa_to_error_probability(quality_character_to_solexa(c)) + : phred_to_error_probability(quality_character_to_phred(c)); } -inline double -char2prob_solexa(const char c) -{ - return 1 - solexa_to_error_probability(quality_character_to_solexa(c)); +inline double char2prob_solexa(const char c) { + return 1 - solexa_to_error_probability(quality_character_to_solexa(c)); } -inline double -char2prob_phred(const char c) -{ - return 1 - phred_to_error_probability(quality_character_to_phred(c)); +inline double char2prob_phred(const char c) { + return 1 - phred_to_error_probability(quality_character_to_phred(c)); } -inline double -char2err_solexa(const char c) -{ - return solexa_to_error_probability(quality_character_to_solexa(c)); +inline double char2err_solexa(const char c) { + return solexa_to_error_probability(quality_character_to_solexa(c)); } -inline double -char2err_phred(const char c) -{ - return phred_to_error_probability(quality_character_to_phred(c)); +inline double char2err_phred(const char c) { + return phred_to_error_probability(quality_character_to_phred(c)); } -inline char -prob2char_solexa(const double prob) -{ - return solexa_to_quality_character( - error_probability_to_solexa(1 - prob)); +inline char prob2char_solexa(const double prob) { + return solexa_to_quality_character(error_probability_to_solexa(1 - prob)); } -inline char -prob2char_phred(const double prob) -{ - return phred_to_quality_character( - error_probability_to_phred(1 - prob)); +inline char prob2char_phred(const double prob) { + return phred_to_quality_character(error_probability_to_phred(1 - prob)); } -inline char -err2char_solexa(const double err) -{ - return solexa_to_quality_character( - error_probability_to_solexa(err)); +inline char err2char_solexa(const double err) { + return solexa_to_quality_character(error_probability_to_solexa(err)); } -inline char -err2char_phred(const double err) -{ - return phred_to_quality_character( - error_probability_to_phred(err)); +inline char err2char_phred(const double err) { + return phred_to_quality_character(error_probability_to_phred(err)); } -inline double -quality_score_to_error_probability(const FASTQScoreType t, - const double s) { - return (FASTQScoreIsSolexa(t)) ? - solexa_to_error_probability(s) : phred_to_error_probability(s); +inline double quality_score_to_error_probability(const FASTQScoreType t, + const double s) { + return (FASTQScoreIsSolexa(t)) ? solexa_to_error_probability(s) + : phred_to_error_probability(s); } -FASTQScoreType -fastq_score_type(const std::string filename); -FASTQScoreType -mapped_reads_score_type(const std::string filename); +FASTQScoreType fastq_score_type(const std::string filename); +FASTQScoreType mapped_reads_score_type(const std::string filename); #endif diff --git a/bisulfite_utils.cpp b/bisulfite_utils.cpp index 05a30c5..b43d9b4 100644 --- a/bisulfite_utils.cpp +++ b/bisulfite_utils.cpp @@ -28,11 +28,10 @@ using std::string; -void -bisulfite_treatment(std::mt19937 &generator, string &seq, - double bs_rate, double meth_rate) { +void bisulfite_treatment(std::mt19937 &generator, string &seq, double bs_rate, + double meth_rate) { - std::uniform_real_distribution unif(0.0,1.0); + std::uniform_real_distribution unif(0.0, 1.0); const size_t seq_len = seq.length() - 1; for (size_t i = 0; i < seq_len; ++i) diff --git a/bisulfite_utils.hpp b/bisulfite_utils.hpp index cdb1816..5721a9b 100644 --- a/bisulfite_utils.hpp +++ b/bisulfite_utils.hpp @@ -16,45 +16,39 @@ * General Public License for more details. */ - #ifndef BISULFITE_UTILS_HPP #define BISULFITE_UTILS_HPP -#include -#include #include +#include +#include #include "sam_record.hpp" namespace bsflags { - // ADS: this is our addition to the SAM flags, using the "free" bits - /* 4096 0x1000 read is A-rich - */ - static const uint16_t read_is_a_rich = 0x1000; -} +// ADS: this is our addition to the SAM flags, using the "free" bits +/* 4096 0x1000 read is A-rich + */ +static const uint16_t read_is_a_rich = 0x1000; +} // namespace bsflags -inline bool -is_t_rich(const sam_rec &sr) { +inline bool is_t_rich(const sam_rec &sr) { return !samflags::check(sr.flags, bsflags::read_is_a_rich); } -inline bool -is_a_rich(const sam_rec &sr) { +inline bool is_a_rich(const sam_rec &sr) { return samflags::check(sr.flags, bsflags::read_is_a_rich); } -inline void -set_t_rich(sam_rec &sr) { +inline void set_t_rich(sam_rec &sr) { samflags::unset(sr.flags, bsflags::read_is_a_rich); } -inline void -set_a_rich(sam_rec &sr) { +inline void set_a_rich(sam_rec &sr) { samflags::set(sr.flags, bsflags::read_is_a_rich); } -void -bisulfite_treatment(std::mt19937 &generator, std::string &seq, - double bs_rate = 1.0, double meth_rate = 0.0); +void bisulfite_treatment(std::mt19937 &generator, std::string &seq, + double bs_rate = 1.0, double meth_rate = 0.0); #endif diff --git a/chromosome_utils.cpp b/chromosome_utils.cpp index 20ae978..e4d82f4 100644 --- a/chromosome_utils.cpp +++ b/chromosome_utils.cpp @@ -22,19 +22,19 @@ #include "chromosome_utils.hpp" -#include #include -#include +#include #include #include -#include +#include +#include -using std::vector; -using std::string; -using std::unordered_map; using std::runtime_error; using std::size; +using std::string; using std::toupper; +using std::unordered_map; +using std::vector; static const char *digits = "0987654321"; static const char *whitespace = " \t"; @@ -42,9 +42,8 @@ static const char *whitespace = " \t"; // define what to do if parts are missing: if no ':' or '-', is whole // thing chrom? -void -parse_region_name(string region_name, - string& chrom, size_t &start, size_t &end) { +void parse_region_name(string region_name, string &chrom, size_t &start, + size_t &end) { const size_t colon_offset = region_name.find(":"); @@ -58,48 +57,47 @@ parse_region_name(string region_name, // get the start const size_t start_end = region_name.find("-", colon_offset + 1); - const string start_string = region_name.substr(colon_offset + 1, - start_end - colon_offset + 1); + const string start_string = + region_name.substr(colon_offset + 1, start_end - colon_offset + 1); start = static_cast(atoi(start_string.c_str())); // get the end - const size_t end_end = - region_name.find_first_not_of(digits, start_end + 1); - const string end_string = region_name.substr(start_end + 1, - end_end - start_end - 1); + const size_t end_end = region_name.find_first_not_of(digits, start_end + 1); + const string end_string = + region_name.substr(start_end + 1, end_end - start_end - 1); end = static_cast(atoi(end_string.c_str())); } -static size_t -adjust_start_pos(const size_t orig_start, const string &chrom_name) { - static constexpr double line_width = 50.0; // ADS: dangerous; often - // this is 80 +static size_t adjust_start_pos(const size_t orig_start, + const string &chrom_name) { + static constexpr double line_width = 50.0; // ADS: dangerous; often + // this is 80 const size_t name_offset = chrom_name.length() + 2; // For the '>' and '\n'; const size_t preceding_newlines = - static_cast(std::floor(orig_start / line_width)); + static_cast(std::floor(orig_start / line_width)); return orig_start + preceding_newlines + name_offset; } -static size_t -adjust_region_size(const size_t orig_start, const size_t orig_size) { - static constexpr double line_width = 50.0; // ADS: dangerous; often - // this is 80 +static size_t adjust_region_size(const size_t orig_start, + const size_t orig_size) { + static constexpr double line_width = 50.0; // ADS: dangerous; often + // this is 80 const size_t preceding_newlines_start = - static_cast(std::floor(orig_start / line_width)); + static_cast(std::floor(orig_start / line_width)); const size_t preceding_newlines_end = - static_cast(std::floor((orig_start + orig_size) / line_width)); + static_cast(std::floor((orig_start + orig_size) / line_width)); return (orig_size + (preceding_newlines_end - preceding_newlines_start)); } template -void -extract_regions_chrom_fasta_impl(const string &chrom_name, - const string &filename, - const vector ®ions, - vector &sequences) { +void extract_regions_chrom_fasta_impl(const string &chrom_name, + const string &filename, + const vector ®ions, + vector &sequences) { std::ifstream in(filename); - if (!in) throw runtime_error("failed to open file: " + filename); + if (!in) + throw runtime_error("failed to open file: " + filename); for (auto &i : regions) { const auto orig_start_pos = i.get_start(); @@ -107,7 +105,8 @@ extract_regions_chrom_fasta_impl(const string &chrom_name, const auto orig_region_size = orig_end_pos - orig_start_pos; const auto start_pos = adjust_start_pos(orig_start_pos, chrom_name); - const auto region_size = adjust_region_size(orig_start_pos, orig_region_size); + const auto region_size = + adjust_region_size(orig_start_pos, orig_region_size); assert(start_pos >= 0); in.seekg(start_pos); @@ -116,32 +115,29 @@ extract_regions_chrom_fasta_impl(const string &chrom_name, buffer.erase(remove(begin(buffer), end(buffer), '\n')); transform(cbegin(buffer), cend(buffer), begin(buffer), - [](const char x) {return toupper(x);}); + [](const char x) { return toupper(x); }); sequences.push_back(std::move(buffer)); assert(i.get_width() == size(sequences.back())); } } -void -extract_regions_chrom_fasta(const string &chrom_name, - const string &filename, - const vector ®ions, - vector &sequences) { +void extract_regions_chrom_fasta(const string &chrom_name, + const string &filename, + const vector ®ions, + vector &sequences) { extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences); } -void -extract_regions_chrom_fasta(const string &chrom_name, - const string &filename, - const vector ®ions, - vector &sequences) { +void extract_regions_chrom_fasta(const string &chrom_name, + const string &filename, + const vector ®ions, + vector &sequences) { extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences); } -void -extract_regions_fasta(const string &dirname, - const vector ®ions_in, - vector &sequences) { +void extract_regions_fasta(const string &dirname, + const vector ®ions_in, + vector &sequences) { static const string FASTA_SUFFIX(".fa"); assert(check_sorted(regions_in)); @@ -149,7 +145,7 @@ extract_regions_fasta(const string &dirname, vector filenames; read_dir(dirname, filenames); - vector > regions; + vector> regions; separate_chromosomes(regions_in, regions); std::unordered_map chrom_regions_map; @@ -161,17 +157,17 @@ extract_regions_fasta(const string &dirname, const string chrom_name(regions[i].front().get_chrom()); const string chrom_file(chrom_name + FASTA_SUFFIX); std::unordered_map::const_iterator f_idx = - chrom_regions_map.find(chrom_file); + chrom_regions_map.find(chrom_file); if (f_idx == chrom_regions_map.end()) throw runtime_error("chrom not found:\t" + chrom_file); - extract_regions_chrom_fasta( - chrom_name, filenames[f_idx->second], regions[i], sequences); + extract_regions_chrom_fasta(chrom_name, filenames[f_idx->second], + regions[i], sequences); } } - void extract_regions_fasta(const string &dirname, - const vector ®ions_in, vector &sequences) { + const vector ®ions_in, + vector &sequences) { static const string FASTA_SUFFIX(".fa"); assert(check_sorted(regions_in)); @@ -179,7 +175,7 @@ void extract_regions_fasta(const string &dirname, vector filenames; read_dir(dirname, filenames); - vector > regions; + vector> regions; separate_chromosomes(regions_in, regions); std::unordered_map chrom_regions_map; @@ -191,32 +187,29 @@ void extract_regions_fasta(const string &dirname, const string chrom_name(regions[i].front().get_chrom()); const string chrom_file(chrom_name + FASTA_SUFFIX); std::unordered_map::const_iterator f_idx = - chrom_regions_map.find(chrom_file); + chrom_regions_map.find(chrom_file); if (f_idx == chrom_regions_map.end()) throw runtime_error("chrom not found:\t" + chrom_file); - extract_regions_chrom_fasta( - chrom_name, filenames[f_idx->second], regions[i], sequences); + extract_regions_chrom_fasta(chrom_name, filenames[f_idx->second], + regions[i], sequences); } } - -void -identify_chromosomes(const string chrom_file, const string fasta_suffix, - unordered_map &chrom_files) { +void identify_chromosomes(const string chrom_file, const string fasta_suffix, + unordered_map &chrom_files) { vector the_files; if (isdir(chrom_file.c_str())) { read_dir(chrom_file, fasta_suffix, the_files); for (size_t i = 0; i < the_files.size(); ++i) chrom_files[strip_path_and_suffix(the_files[i])] = the_files[i]; } - else chrom_files[strip_path_and_suffix(chrom_file)] = chrom_file; + else + chrom_files[strip_path_and_suffix(chrom_file)] = chrom_file; } - -void -identify_and_read_chromosomes(const string chrom_file, - const string fasta_suffix, - unordered_map &chrom_files) { +void identify_and_read_chromosomes(const string chrom_file, + const string fasta_suffix, + unordered_map &chrom_files) { vector the_files; if (isdir(chrom_file.c_str())) { read_dir(chrom_file, fasta_suffix, the_files); diff --git a/chromosome_utils.hpp b/chromosome_utils.hpp index 4c3ed22..0e79c90 100644 --- a/chromosome_utils.hpp +++ b/chromosome_utils.hpp @@ -23,61 +23,46 @@ #ifndef CHROMOSOME_UTILS_HPP #define CHROMOSOME_UTILS_HPP -#include -#include #include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include "GenomicRegion.hpp" -void -parse_region_name(std::string region_name, - std::string &chrom, size_t &start, size_t &end); - - -void -extract_regions_chrom_fasta(const std::string &chrom_name, - const std::string &filename, - const std::vector ®ions, - std::vector &sequences); - - -void -extract_regions_chrom_fasta(const std::string &chrom_name, - const std::string &filename, - const std::vector ®ions, - std::vector &sequences); - - -void -extract_regions_fasta(const std::string &dirname, - const std::vector ®ions_in, - std::vector &sequences); +void parse_region_name(std::string region_name, std::string &chrom, + size_t &start, size_t &end); +void extract_regions_chrom_fasta(const std::string &chrom_name, + const std::string &filename, + const std::vector ®ions, + std::vector &sequences); -void -extract_regions_fasta(const std::string &dirname, - const std::vector ®ions_in, - std::vector &sequences); +void extract_regions_chrom_fasta( + const std::string &chrom_name, const std::string &filename, + const std::vector ®ions, + std::vector &sequences); +void extract_regions_fasta(const std::string &dirname, + const std::vector ®ions_in, + std::vector &sequences); -void -identify_chromosomes(const std::string chrom_file, - const std::string fasta_suffix, - std::unordered_map &chrom_files); +void extract_regions_fasta(const std::string &dirname, + const std::vector ®ions_in, + std::vector &sequences); +void identify_chromosomes( + const std::string chrom_file, const std::string fasta_suffix, + std::unordered_map &chrom_files); -void -identify_and_read_chromosomes(const std::string chrom_file, - const std::string fasta_suffix, - std::unordered_map &chrom_files); +void identify_and_read_chromosomes( + const std::string chrom_file, const std::string fasta_suffix, + std::unordered_map &chrom_files); #endif diff --git a/cigar_utils.cpp b/cigar_utils.cpp index 3e4b66a..43e6d73 100644 --- a/cigar_utils.cpp +++ b/cigar_utils.cpp @@ -19,13 +19,12 @@ #include #include +using std::runtime_error; using std::string; using std::to_string; -using std::runtime_error; -void -apply_cigar(const string &cigar, string &to_inflate, - const char inflation_symbol) { +void apply_cigar(const string &cigar, string &to_inflate, + const char inflation_symbol) { std::istringstream iss(cigar); string inflated_seq; @@ -51,9 +50,8 @@ apply_cigar(const string &cigar, string &to_inflate, // sum of total M/I/S/=/X/N operations must equal length of seq const size_t orig_len = to_inflate.length(); if (i != orig_len) - throw runtime_error("inconsistent number of qseq ops in cigar: " + - to_inflate + " " + cigar + " " + - to_string(i) + " " + - to_string(orig_len)); + throw runtime_error( + "inconsistent number of qseq ops in cigar: " + to_inflate + " " + + cigar + " " + to_string(i) + " " + to_string(orig_len)); to_inflate.swap(inflated_seq); } diff --git a/cigar_utils.hpp b/cigar_utils.hpp index fb37338..e1e8eea 100644 --- a/cigar_utils.hpp +++ b/cigar_utils.hpp @@ -20,100 +20,82 @@ #include // isdigit #include -inline bool -consumes_query(const char op) { - return ((op == 'M') || - (op == 'I') || - (op == 'S') || - (op == '=') || +inline bool consumes_query(const char op) { + return ((op == 'M') || (op == 'I') || (op == 'S') || (op == '=') || (op == 'X')); } -inline bool -consumes_reference(const char op) { - return ((op == 'M') || - (op == 'D') || - (op == 'N') || - (op == '=') || +inline bool consumes_reference(const char op) { + return ((op == 'M') || (op == 'D') || (op == 'N') || (op == '=') || (op == 'X')); } template -static size_t -extract_op_count(InItr &itr, const InItr last) { +static size_t extract_op_count(InItr &itr, const InItr last) { size_t x = 0; while (itr != last && std::isdigit(*itr)) - x = x*10 + (*itr++ - '0'); + x = x * 10 + (*itr++ - '0'); return x; } template -size_t -cigar_total_ops(InputItr itr, const InputItr last) { +size_t cigar_total_ops(InputItr itr, const InputItr last) { size_t op_count = 0; while (itr != last) { op_count += extract_op_count(itr, last); - if (itr != last) *itr++; // eat the op code + if (itr != last) + *itr++; // eat the op code } return op_count; } template -size_t -cigar_qseq_ops(InputItr itr, const InputItr last) { +size_t cigar_qseq_ops(InputItr itr, const InputItr last) { size_t op_count = 0; while (itr != last) { const size_t curr_val = extract_op_count(itr, last); - op_count += curr_val*consumes_query(*itr++); + op_count += curr_val * consumes_query(*itr++); } return op_count; } -template -size_t -cigar_qseq_ops(const T &cigar) { +template size_t cigar_qseq_ops(const T &cigar) { return cigar_qseq_ops(std::begin(cigar), std::end(cigar)); } template -size_t -cigar_rseq_ops(InputItr itr, const InputItr last) { +size_t cigar_rseq_ops(InputItr itr, const InputItr last) { size_t op_count = 0; while (itr != last) { const size_t curr_val = extract_op_count(itr, last); - op_count += curr_val*consumes_reference(*itr++); + op_count += curr_val * consumes_reference(*itr++); } return op_count; } -template constexpr -size_t -cigar_rseq_ops(const T &cigar) { +template constexpr size_t cigar_rseq_ops(const T &cigar) { return cigar_rseq_ops(std::begin(cigar), std::end(cigar)); } template -void -reverse_cigar(InputItr left, const InputItr last) { +void reverse_cigar(InputItr left, const InputItr last) { std::reverse(left, last); InputItr right(left); while (left != last) { right = std::find_if(++right, last, // ++right eats op in rev orientation - [](const char c) {return !std::isdigit(c);}); + [](const char c) { return !std::isdigit(c); }); std::reverse(left, right); left = right; } } -template -void -reverse_cigar(T &cigar) { +template void reverse_cigar(T &cigar) { reverse_cigar(std::begin(cigar), std::end(cigar)); } template -InputItr -truncate_cigar(InputItr left, const InputItr last, const size_t target_ops) { +InputItr truncate_cigar(InputItr left, const InputItr last, + const size_t target_ops) { /* truncate cigar after specified number of total operations */ InputItr right(left); size_t prev_ops = 0, curr_ops = 0; @@ -130,8 +112,8 @@ truncate_cigar(InputItr left, const InputItr last, const size_t target_ops) { } template -InputItr -truncate_cigar_q(InputItr left, const InputItr last, const size_t target_ops) { +InputItr truncate_cigar_q(InputItr left, const InputItr last, + const size_t target_ops) { /* truncate cigar after specified number of query operations */ InputItr right(left); size_t prev_ops = 0, curr_ops = 0; @@ -141,16 +123,15 @@ truncate_cigar_q(InputItr left, const InputItr last, const size_t target_ops) { prev_ops += curr_ops; curr_ops = extract_op_count(right, last); op = *right++; // cigar always ends with an op - if (!consumes_query(op)) curr_ops = 0; + if (!consumes_query(op)) + curr_ops = 0; } if (left != last && target_ops > prev_ops) left += snprintf(&(*left), 16, "%lu%c", target_ops - prev_ops, op); return left; } -template -void -truncate_cigar_q(T &cigar, const size_t target_ops) { +template void truncate_cigar_q(T &cigar, const size_t target_ops) { /* truncate cigar after specified number of query operations */ auto c_beg(std::begin(cigar)); auto updated_end = truncate_cigar_q(c_beg, std::end(cigar), target_ops); @@ -158,8 +139,8 @@ truncate_cigar_q(T &cigar, const size_t target_ops) { } template -InputItr -truncate_cigar_r(InputItr left, const InputItr last, const size_t target_ops) { +InputItr truncate_cigar_r(InputItr left, const InputItr last, + const size_t target_ops) { /* truncate cigar after specified number of reference operations */ InputItr right(left); size_t prev_ops = 0, curr_ops = 0; @@ -169,16 +150,15 @@ truncate_cigar_r(InputItr left, const InputItr last, const size_t target_ops) { prev_ops += curr_ops; curr_ops = extract_op_count(right, last); op = *right++; // cigar always ends with an op - if (!consumes_reference(op)) curr_ops = 0; + if (!consumes_reference(op)) + curr_ops = 0; } if (left != last && target_ops > prev_ops) left += snprintf(&(*left), 16, "%lu%c", target_ops - prev_ops, op); return left; } -template -void -truncate_cigar_r(T &cigar, const size_t target_ops) { +template void truncate_cigar_r(T &cigar, const size_t target_ops) { /* truncate cigar after specified number of reference operations */ auto c_beg(std::begin(cigar)); auto updated_end = truncate_cigar_r(c_beg, std::end(cigar), target_ops); @@ -186,8 +166,7 @@ truncate_cigar_r(T &cigar, const size_t target_ops) { } template -InputItr -merge_equal_neighbor_cigar_ops(InputItr rd_itr, const InputItr last) { +InputItr merge_equal_neighbor_cigar_ops(InputItr rd_itr, const InputItr last) { /* renders a cigar valid by merging consecutive identical operations */ InputItr wr_itr(rd_itr); size_t prev_op_count = extract_op_count(rd_itr, last); @@ -207,10 +186,7 @@ merge_equal_neighbor_cigar_ops(InputItr rd_itr, const InputItr last) { return wr_itr; } - -template -void -merge_equal_neighbor_cigar_ops(T &cigar) { +template void merge_equal_neighbor_cigar_ops(T &cigar) { /* renders a cigar valid by merging consecutive identical operations */ auto c_beg(std::begin(cigar)); auto updated_end = merge_equal_neighbor_cigar_ops(c_beg, std::end(cigar)); @@ -218,53 +194,45 @@ merge_equal_neighbor_cigar_ops(T &cigar) { } template -void -internal_S_to_M(InputItr rd_itr, const InputItr last) { +void internal_S_to_M(InputItr rd_itr, const InputItr last) { /* converts internal soft-clip (S) symbols into match/mismatch (M) symbols */ extract_op_count(rd_itr, last); // move past first op count - rd_itr++; // move past first op + rd_itr++; // move past first op while (rd_itr != last) { extract_op_count(rd_itr, last); // past curr op count - auto curr_op = rd_itr++; // save curr op iterator + auto curr_op = rd_itr++; // save curr op iterator if (rd_itr != last && *curr_op == 'S') { *curr_op = 'M'; } } } -template -void -internal_S_to_M(T &cigar) { +template void internal_S_to_M(T &cigar) { /* converts internal soft-clip (S) symbols into match/mismatch (M) symbols */ internal_S_to_M(std::begin(cigar), std::end(cigar)); } -template -void -terminal_S_to_M(T &cigar) { +template void terminal_S_to_M(T &cigar) { /* converts a terminal soft-clip symbol to a match/mismatch (M) symbol */ - if (cigar.back() == 'S') cigar.back() = 'M'; + if (cigar.back() == 'S') + cigar.back() = 'M'; } template -void -initial_S_to_M(InputItr rd_itr, const InputItr last) { +void initial_S_to_M(InputItr rd_itr, const InputItr last) { /* converts an initial soft-clip symbol to a match/mismatch (M) symbol */ extract_op_count(rd_itr, last); if (*rd_itr == 'S') *rd_itr = 'M'; } -template -void -initial_S_to_M(T &cigar) { +template void initial_S_to_M(T &cigar) { /* converts an initial soft-clip symbol to a match/mismatch (M) symbol */ initial_S_to_M(std::begin(cigar), std::end(cigar)); } template -size_t -get_soft_clip_size(InputItr rd_itr, const InputItr last) { +size_t get_soft_clip_size(InputItr rd_itr, const InputItr last) { /* determine the total size of the soft-clips at both ends */ size_t r = 0; size_t curr_op_count = extract_op_count(rd_itr, last); @@ -278,32 +246,26 @@ get_soft_clip_size(InputItr rd_itr, const InputItr last) { return (curr_op == 'S') ? r + curr_op_count : r; } -template constexpr -size_t -get_soft_clip_size(T &cigar) { +template constexpr size_t get_soft_clip_size(T &cigar) { /* determine the total size of the soft-clips at both ends */ return get_soft_clip_size(std::begin(cigar), std::end(cigar)); } template -size_t -get_soft_clip_size_start(InputItr rd_itr, const InputItr last) { +size_t get_soft_clip_size_start(InputItr rd_itr, const InputItr last) { /* determine the size of the soft-clip at the start of cigar */ const size_t curr_op_count = extract_op_count(rd_itr, last); return (*rd_itr == 'S') ? curr_op_count : 0; } -template constexpr -size_t -get_soft_clip_size_start(T &cigar) { +template constexpr size_t get_soft_clip_size_start(T &cigar) { /* determine the size of the soft-clip at the start of cigar */ return get_soft_clip_size_start(std::begin(cigar), std::end(cigar)); } // turns cigar symbols into compressed format template -void -compress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { +void compress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { /* convert a sequence of cigar symbols into a valid cigar string */ const size_t N = cigar.size(); unsigned j = 0, n = 0; @@ -321,8 +283,7 @@ compress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { // turns a cigar string into a string of cigar symbols template -void -uncompress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { +void uncompress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { /* convert a valid cigar string into a sequence of cigar symbols */ cigar.clear(); // ADS: should this be assumed? while (c_itr != c_end) { @@ -331,8 +292,7 @@ uncompress_cigar(T1 c_itr, const T1 c_end, T2 &cigar) { } } -void -apply_cigar(const std::string &cigar, std::string &to_inflate, - const char inflation_symbol = 'N'); +void apply_cigar(const std::string &cigar, std::string &to_inflate, + const char inflation_symbol = 'N'); #endif diff --git a/dna_four_bit.hpp b/dna_four_bit.hpp index 68dc1b6..9724dda 100644 --- a/dna_four_bit.hpp +++ b/dna_four_bit.hpp @@ -25,22 +25,18 @@ enum base_in_byte { left, right }; extern char dna_four_bit_decoding[16]; -template constexpr -uint_type -get_nibble(const uint_type x, const size_t offset) { - return (x >> (4*offset)) & 15ul; +template +constexpr uint_type get_nibble(const uint_type x, const size_t offset) { + return (x >> (4 * offset)) & 15ul; } -template constexpr -char -decode_dna_four_bit(const uint_type x, - const size_t offset) { +template +constexpr char decode_dna_four_bit(const uint_type x, const size_t offset) { return dna_four_bit_decoding[get_nibble(x, offset)]; } -template -OutputIt -decode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { +template +OutputIt decode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { // ADS: assume destination has enough space while (first != last) { for (size_t offset = 0; offset < 16; ++offset) @@ -52,12 +48,11 @@ decode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { return d_first; } -template -void -decode_dna_four_bit(const InCtr &source, OutCtr &dest) { +template +void decode_dna_four_bit(const InCtr &source, OutCtr &dest) { // expand out the bytes as pairs (do this backwards in case source == dest) const size_t source_size = source.size(); - dest.resize(16*source_size); + dest.resize(16 * source_size); size_t i = source_size; size_t j = dest.size(); while (i > 0) { @@ -71,18 +66,14 @@ decode_dna_four_bit(const InCtr &source, OutCtr &dest) { } extern uint8_t dna_four_bit_encoding[128]; -template constexpr -size_t -encode_dna_four_bit(const uint_type x, - const size_t offset) { - return (static_cast( - dna_four_bit_encoding[static_cast(x)]) - ) << (4*offset); +template +constexpr size_t encode_dna_four_bit(const uint_type x, const size_t offset) { + return (static_cast(dna_four_bit_encoding[static_cast(x)])) + << (4 * offset); } -template -OutputIt -encode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { +template +OutputIt encode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { while (first != last) { *d_first = 0; for (size_t i = 0; i < 16 && first != last; ++i) @@ -96,12 +87,11 @@ encode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) { // of size_t values struct genome_four_bit_itr { genome_four_bit_itr(const std::vector::const_iterator itr_, - const int off_ = 0) : itr(itr_), offset(off_) {} + const int off_ = 0) + : itr(itr_), offset(off_) {} - size_t operator*() const { - return (*itr >> (offset << 2)) & 15ul; - } - genome_four_bit_itr& operator++() { + size_t operator*() const { return (*itr >> (offset << 2)) & 15ul; } + genome_four_bit_itr &operator++() { offset = (offset + 1) & 15ul; itr += (offset == 0); return *this; @@ -112,7 +102,7 @@ struct genome_four_bit_itr { itr += (offset == 0); return tmp; } - genome_four_bit_itr& operator--() { + genome_four_bit_itr &operator--() { itr -= (offset == 0); offset = (offset - 1) & 15ul; @@ -129,7 +119,7 @@ struct genome_four_bit_itr { genome_four_bit_itr operator+(const size_t step) const { // whether the sum of offsets is >= 16 const bool shift_one_pos = - ((offset + (static_cast(step) & 15)) & 16) >> 4; + ((offset + (static_cast(step) & 15)) & 16) >> 4; const int new_offset = (offset + step) & 15; return genome_four_bit_itr(itr + step / 16 + shift_one_pos, new_offset); diff --git a/htslib_wrapper.cpp b/htslib_wrapper.cpp index edf2b38..938a1f3 100644 --- a/htslib_wrapper.cpp +++ b/htslib_wrapper.cpp @@ -32,10 +32,7 @@ using std::runtime_error; using std::string; using std::vector; -char -check_htslib_wrapper() { - return 1; -} +char check_htslib_wrapper() { return 1; } SAMReader::SAMReader(const string &fn) : filename(fn), good(true), hts(nullptr), hdr(nullptr), b(nullptr) { @@ -68,8 +65,7 @@ SAMReader::~SAMReader() { good = false; } -SAMReader & -operator>>(SAMReader &reader, sam_rec &aln) { +SAMReader &operator>>(SAMReader &reader, sam_rec &aln) { reader.get_sam_record(aln); return reader; } @@ -78,8 +74,7 @@ operator>>(SAMReader &reader, sam_rec &aln) { //// general facility for SAM format ///////////////////////////////////////////// -bool -SAMReader::get_sam_record(sam_rec &sr) { +bool SAMReader::get_sam_record(sam_rec &sr) { int rd_ret = 0; if ((rd_ret = sam_read1(hts, hdr, b)) >= 0) { // ADS: the line below implicitly converts the 0-based leftmost @@ -90,7 +85,7 @@ SAMReader::get_sam_record(sam_rec &sr) { if ((fmt_ret = sam_format1(hdr, b, &hts->line)) <= 0) throw runtime_error("failed reading record from: " + filename); sr = sam_rec(hts->line.s); - good = true; // reader.get_sam_record(reader.hts->line.s, sr); + good = true; // reader.get_sam_record(reader.hts->line.s, sr); // ADS: line below seems to be needed when the file format is SAM // because the hts_getline for parsing SAM format files within // sam_read1 only get called when "(fp->line.l == 0)". For BAM @@ -101,12 +96,11 @@ SAMReader::get_sam_record(sam_rec &sr) { } else if (rd_ret == -1) good = false; - else // rd_ret < -1 + else // rd_ret < -1 throw runtime_error("failed to read record from file: " + filename); return good; } -string -SAMReader::get_header() const { - return hdr->text; // includes newline +string SAMReader::get_header() const { + return hdr->text; // includes newline } diff --git a/htslib_wrapper.hpp b/htslib_wrapper.hpp index 1a0f8a5..51c268c 100644 --- a/htslib_wrapper.hpp +++ b/htslib_wrapper.hpp @@ -31,7 +31,7 @@ extern "C" { } extern "C" { - char check_htslib_wrapper(); +char check_htslib_wrapper(); } class SAMReader { @@ -46,6 +46,7 @@ class SAMReader { void add_threads(const size_t n_threads); std::string get_header() const; + private: // data std::string filename; @@ -56,7 +57,6 @@ class SAMReader { bam1_t *b{}; }; -SAMReader & -operator>>(SAMReader &sam_stream, sam_rec &samr); +SAMReader &operator>>(SAMReader &sam_stream, sam_rec &samr); #endif diff --git a/sam_record.cpp b/sam_record.cpp index 41996d4..2b04839 100644 --- a/sam_record.cpp +++ b/sam_record.cpp @@ -18,44 +18,42 @@ #include "sam_record.hpp" +#include #include #include -#include #include "cigar_utils.hpp" #include "smithlab_utils.hpp" -using std::string; -using std::istringstream; -using std::runtime_error; -using std::regex; -using std::istream; -using std::ostream; using std::begin; using std::end; +using std::istream; +using std::istringstream; +using std::ostream; using std::ostringstream; +using std::regex; +using std::runtime_error; +using std::string; using std::to_string; // ADS: this is for debugging purposes -string -format_sam_flags(const uint16_t the_flags) { +string format_sam_flags(const uint16_t the_flags) { ostringstream oss; using samflags::check; oss << "read_paired: " << check(the_flags, samflags::read_paired) << '\n' - << "read_pair_mapped: " - << check(the_flags, samflags::read_pair_mapped) << '\n' + << "read_pair_mapped: " << check(the_flags, samflags::read_pair_mapped) + << '\n' << "read_unmapped: " << check(the_flags, samflags::read_unmapped) << '\n' << "mate_unmapped: " << check(the_flags, samflags::mate_unmapped) << '\n' << "read_rc: " << check(the_flags, samflags::read_rc) << '\n' << "mate_rc: " << check(the_flags, samflags::mate_rc) << '\n' - << "template_first: " - << check(the_flags, samflags::template_first) << '\n' + << "template_first: " << check(the_flags, samflags::template_first) + << '\n' << "template_last: " << check(the_flags, samflags::template_last) << '\n' << "secondary_aln: " << check(the_flags, samflags::secondary_aln) << '\n' << "below_quality: " << check(the_flags, samflags::below_quality) << '\n' << "pcr_duplicate: " << check(the_flags, samflags::pcr_duplicate) << '\n' - << "supplementary_aln: " - << check(the_flags, samflags::supplementary_aln); + << "supplementary_aln: " << check(the_flags, samflags::supplementary_aln); return oss.str(); } @@ -74,35 +72,25 @@ format_sam_flags(const uint16_t the_flags) { // return regex_match(qual, regex("[!-~]+")); // } -size_t -sam_rec::estimate_line_size() const { +size_t sam_rec::estimate_line_size() const { static const size_t all_field_estimates = 100; return qname.size() + rname.size() + qual.size() + all_field_estimates; } -string -sam_rec::tostring() const { +string sam_rec::tostring() const { string out; out.reserve(estimate_line_size()); - out.append(qname + "\t" + - to_string(flags) + "\t" + - rname + "\t" + - to_string(pos) + "\t" + - to_string(static_cast(mapq)) + "\t" + - cigar + "\t" + - rnext + "\t" + - to_string(pnext) + "\t" + - to_string(tlen) + "\t" + - seq + "\t" + - qual); + out.append(qname + "\t" + to_string(flags) + "\t" + rname + "\t" + + to_string(pos) + "\t" + to_string(static_cast(mapq)) + + "\t" + cigar + "\t" + rnext + "\t" + to_string(pnext) + "\t" + + to_string(tlen) + "\t" + seq + "\t" + qual); for (auto it(begin(tags)); it != end(tags); ++it) out.append("\t" + *it); return out; } -ostream & -operator<<(std::ostream &the_stream, const sam_rec &r) { +ostream &operator<<(std::ostream &the_stream, const sam_rec &r) { the_stream << r.tostring(); return the_stream; } @@ -111,12 +99,11 @@ sam_rec::sam_rec(const string &line) { /* istringstream iss; // ADS: change to set the buffer from "line" iss.rdbuf()->pubsetbuf(const_cast(line.c_str()), line.size());*/ - istringstream iss(line); // ADS: unfortunate macos stuff? + istringstream iss(line); // ADS: unfortunate macos stuff? int32_t will_become_mapq = 0; // to not read mapq as character // since it's uint8_t - if (!(iss >> - qname >> flags >> rname >> pos >> will_become_mapq >> - cigar >> rnext >> pnext >> tlen >> seq >> qual)) + if (!(iss >> qname >> flags >> rname >> pos >> will_become_mapq >> cigar >> + rnext >> pnext >> tlen >> seq >> qual)) throw runtime_error("incorrect SAM record:\n" + line); if (will_become_mapq < 0 || will_become_mapq > 255) throw runtime_error("invalid mapq in SAM record: " + line); @@ -127,13 +114,9 @@ sam_rec::sam_rec(const string &line) { tags.push_back(tmp); } -bool -sam_rec::empty() const { - return pos == 0; -} +bool sam_rec::empty() const { return pos == 0; } -void -inflate_with_cigar(const sam_rec &sr, string &to_inflate, - const char inflation_symbol) { +void inflate_with_cigar(const sam_rec &sr, string &to_inflate, + const char inflation_symbol) { apply_cigar(sr.cigar, to_inflate, inflation_symbol); } diff --git a/sam_record.hpp b/sam_record.hpp index 4c9bcf8..7b837b4 100644 --- a/sam_record.hpp +++ b/sam_record.hpp @@ -19,12 +19,12 @@ #ifndef SAM_RECORD_HPP #define SAM_RECORD_HPP -#include -#include +#include #include -#include #include -#include +#include +#include +#include // from 30 April 2020 SAM documentation // 1 0x1 template having multiple segments in sequencing @@ -42,30 +42,28 @@ namespace samflags { - // ADS: names of flags adjusted to how we typically interpret - static const uint16_t read_paired = 0x1; - static const uint16_t read_pair_mapped = 0x2; - static const uint16_t read_unmapped = 0x4; - static const uint16_t mate_unmapped = 0x8; - static const uint16_t read_rc = 0x10; - static const uint16_t mate_rc = 0x20; - static const uint16_t template_first = 0x40; - static const uint16_t template_last = 0x80; - static const uint16_t secondary_aln = 0x100; - static const uint16_t below_quality = 0x200; - static const uint16_t pcr_duplicate = 0x400; - static const uint16_t supplementary_aln = 0x800; - - constexpr bool - check(const uint16_t to_check, const uint16_t &f) {return to_check & f;} - inline void - set(uint16_t &to_set, const uint16_t f) {to_set |= f;} - inline void - unset(uint16_t &to_unset, const uint16_t f) {to_unset &= ~f;} +// ADS: names of flags adjusted to how we typically interpret +static const uint16_t read_paired = 0x1; +static const uint16_t read_pair_mapped = 0x2; +static const uint16_t read_unmapped = 0x4; +static const uint16_t mate_unmapped = 0x8; +static const uint16_t read_rc = 0x10; +static const uint16_t mate_rc = 0x20; +static const uint16_t template_first = 0x40; +static const uint16_t template_last = 0x80; +static const uint16_t secondary_aln = 0x100; +static const uint16_t below_quality = 0x200; +static const uint16_t pcr_duplicate = 0x400; +static const uint16_t supplementary_aln = 0x800; + +constexpr bool check(const uint16_t to_check, const uint16_t &f) { + return to_check & f; } +inline void set(uint16_t &to_set, const uint16_t f) { to_set |= f; } +inline void unset(uint16_t &to_unset, const uint16_t f) { to_unset &= ~f; } +} // namespace samflags -std::string -format_sam_flags(const uint16_t the_flags); +std::string format_sam_flags(const uint16_t the_flags); // Col Field Type Regexp/Range Brief description // 1 QNAME Query template NAME @@ -77,8 +75,8 @@ format_sam_flags(const uint16_t the_flags); // 7 RNEXT Reference name of the mate/next read // 8 PNEXT Position of the mate/next read // 9 TLEN observed Template LENgth -//10 SEQ segment SEQuence -//11 QUAL Phred-scaled base QUALity+33 +// 10 SEQ segment SEQuence +// 11 QUAL Phred-scaled base QUALity+33 class sam_rec { public: @@ -97,65 +95,44 @@ class sam_rec { std::vector tags; sam_rec() : flags(0), pos(0), mapq(255), pnext(0), tlen(0) {} explicit sam_rec(const std::string &line); - sam_rec(const std::string &_qname, - const uint16_t _flags, - const std::string &_rname, - const uint32_t &_pos, - const uint8_t &_mapq, - const std::string &_cigar, - const std::string &_rnext, - const uint32_t &_pnext, - const int32_t &_tlen, - const std::string &_seq, - const std::string &_qual) : - qname(_qname), - flags(_flags), - rname(_rname), - pos(_pos), - mapq(_mapq), - cigar(_cigar), - rnext(_rnext), - pnext(_pnext), - tlen(_tlen), - seq(_seq), - qual(_qual) {} - void add_tag(const std::string &the_tag) {tags.push_back(the_tag);} + sam_rec(const std::string &_qname, const uint16_t _flags, + const std::string &_rname, const uint32_t &_pos, const uint8_t &_mapq, + const std::string &_cigar, const std::string &_rnext, + const uint32_t &_pnext, const int32_t &_tlen, const std::string &_seq, + const std::string &_qual) + : qname(_qname), flags(_flags), rname(_rname), pos(_pos), mapq(_mapq), + cigar(_cigar), rnext(_rnext), pnext(_pnext), tlen(_tlen), seq(_seq), + qual(_qual) {} + void add_tag(const std::string &the_tag) { tags.push_back(the_tag); } bool empty() const; size_t estimate_line_size() const; std::string tostring() const; }; -inline bool -check_flag(const sam_rec &sr, const uint16_t f) { +inline bool check_flag(const sam_rec &sr, const uint16_t f) { return samflags::check(sr.flags, f); } -inline void -set_flag(sam_rec &sr, const uint16_t f) { +inline void set_flag(sam_rec &sr, const uint16_t f) { return samflags::set(sr.flags, f); } -inline void -unset_flag(sam_rec &sr, const uint16_t f) { +inline void unset_flag(sam_rec &sr, const uint16_t f) { return samflags::unset(sr.flags, f); } -std::istream & -operator>>(std::istream &in, sam_rec &r); +std::istream &operator>>(std::istream &in, sam_rec &r); -std::ostream & -operator<<(std::ostream &the_stream, const sam_rec &r); +std::ostream &operator<<(std::ostream &the_stream, const sam_rec &r); -void -inflate_with_cigar(const sam_rec &sr, std::string &to_inflate, - const char inflation_symbol = 'N'); +void inflate_with_cigar(const sam_rec &sr, std::string &to_inflate, + const char inflation_symbol = 'N'); // assumes program name is at argv[0] -template -void -write_pg_line(const int argc, const char **argv, - const std::string &program_name, - const std::string &program_version, T &out) { +template +void write_pg_line(const int argc, const char **argv, + const std::string &program_name, + const std::string &program_version, T &out) { out << "@PG\t"; // empty program, should never happen @@ -170,40 +147,37 @@ write_pg_line(const int argc, const char **argv, std::ostringstream the_command; copy(argv, argv + argc, - std::ostream_iterator(the_command, " ")); + std::ostream_iterator(the_command, " ")); out << "CL:\"" << the_command.str() << "\"" << std::endl; } } -template -static void -write_sam_header(const std::vector &chrom_names, - const std::vector &chrom_sizes, - const std::string program_name, - const std::string program_version, - const int argc, const char **argv, - std::ostream &out) { +template +static void write_sam_header(const std::vector &chrom_names, + const std::vector &chrom_sizes, + const std::string program_name, + const std::string program_version, const int argc, + const char **argv, std::ostream &out) { static const std::string SAM_VERSION = "1.0"; // sam version - out <<"@HD" << '\t' << "VN:" << SAM_VERSION << '\n'; // sam version + out << "@HD" << '\t' << "VN:" << SAM_VERSION << '\n'; // sam version // chromosome sizes const size_t n_chroms = chrom_names.size(); for (size_t i = 0; i < n_chroms; ++i) { - out << "@SQ" << '\t' - << "SN:" << chrom_names[i] << '\t' + out << "@SQ" << '\t' << "SN:" << chrom_names[i] << '\t' << "LN:" << chrom_sizes[i] << '\n'; } // program details - out << "@PG" << '\t' - << "ID:" << program_name << '\t' + out << "@PG" << '\t' << "ID:" << program_name << '\t' << "VN:" << program_version << '\t'; // how the program was run std::ostringstream the_command; - copy(argv, argv + argc, std::ostream_iterator(the_command, " ")); + copy(argv, argv + argc, + std::ostream_iterator(the_command, " ")); out << "CL:\"" << the_command.str() << "\"" << std::endl; } diff --git a/smithlab_os.cpp b/smithlab_os.cpp index a1688ab..8e33d46 100644 --- a/smithlab_os.cpp +++ b/smithlab_os.cpp @@ -16,35 +16,33 @@ * General Public License for more details. */ -#include -#include #include -#include -#include #include -#include +#include #include #include +#include +#include +#include +#include +#include "QualityScore.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" -#include "QualityScore.hpp" -#include #include +#include #include #include -#include -using std::string; -using std::vector; +using std::begin; using std::ios_base; -using std::unordered_map; using std::runtime_error; -using std::begin; +using std::string; +using std::unordered_map; +using std::vector; -string -strip_path(string full_path) { +string strip_path(string full_path) { size_t start = full_path.find_last_of('/'); if (start == string::npos) start = 0; @@ -53,8 +51,7 @@ strip_path(string full_path) { return full_path.substr(start); } -string -strip_path_and_suffix(string full_path) { +string strip_path_and_suffix(string full_path) { size_t start = full_path.find_last_of('/'); if (start == string::npos) start = 0; @@ -66,28 +63,27 @@ strip_path_and_suffix(string full_path) { return full_path.substr(start, end - start); } -void -parse_dir_basename_suffix(const string &full_path, - string &dirname, - string &base_name, string &suffix) { +void parse_dir_basename_suffix(const string &full_path, string &dirname, + string &base_name, string &suffix) { const size_t base_index = full_path.find_last_of("/\\"); if (base_index != string::npos) dirname = full_path.substr(0, base_index); - else dirname = ""; + else + dirname = ""; const size_t suffix_index = full_path.find_last_of("."); if (suffix_index != string::npos && (base_index == string::npos || suffix_index > base_index)) suffix = string(begin(full_path) + suffix_index + 1, end(full_path)); - else suffix = ""; + else + suffix = ""; base_name = string(begin(full_path) + base_index + 1, begin(full_path) + suffix_index); } -void -format_dir_basename_suffix(const string &dn, const string &bn, - const string &sf, string &full_path) { +void format_dir_basename_suffix(const string &dn, const string &bn, + const string &sf, string &full_path) { full_path = dn; if (!dn.empty() && bn.back() != '/') full_path += '/'; @@ -98,15 +94,13 @@ format_dir_basename_suffix(const string &dn, const string &bn, full_path += (sf[0] == '.' || full_path.back() == '.') ? sf : "." + sf; } -bool -isdir(const char *filename) { +bool isdir(const char *filename) { struct stat buffer; stat(filename, &buffer); return S_ISDIR(buffer.st_mode); } -bool -is_fastq(const string &filename) { +bool is_fastq(const string &filename) { std::ifstream f(filename.c_str()); char c = '\0'; f >> c; @@ -117,14 +111,12 @@ is_fastq(const string &filename) { //////////////////////////////////////////////////////////////////////// // Stuff dealing with FASTA format sequence files -bool -is_valid_filename(const string &name, const string &filename_suffix) { +bool is_valid_filename(const string &name, const string &filename_suffix) { const string suffix(name.substr(name.find_last_of(".") + 1)); return (suffix == filename_suffix); } -string -path_join(const string &a, const string &b) { +string path_join(const string &a, const string &b) { if (b.empty() || b[0] == '/') throw runtime_error("cannot prepend dir to file \"" + b + "\""); if (!a.empty() && a[a.length() - 1] == '/') @@ -133,22 +125,21 @@ path_join(const string &a, const string &b) { return a + "/" + b; } -void -identify_chromosomes(const string &chrom_file, const string fasta_suffix, - unordered_map &chrom_files) { +void identify_chromosomes(const string &chrom_file, const string fasta_suffix, + unordered_map &chrom_files) { vector the_files; if (isdir(chrom_file.c_str())) { read_dir(chrom_file, fasta_suffix, the_files); for (size_t i = 0; i < the_files.size(); ++i) chrom_files[strip_path_and_suffix(the_files[i])] = the_files[i]; } - else chrom_files[strip_path_and_suffix(chrom_file)] = chrom_file; + else + chrom_files[strip_path_and_suffix(chrom_file)] = chrom_file; } -void -identify_and_read_chromosomes(const string &chrom_file, - const string fasta_suffix, - unordered_map &chrom_files) { +void identify_and_read_chromosomes(const string &chrom_file, + const string fasta_suffix, + unordered_map &chrom_files) { vector the_files; if (isdir(chrom_file.c_str())) { read_dir(chrom_file, fasta_suffix, the_files); @@ -164,9 +155,8 @@ identify_and_read_chromosomes(const string &chrom_file, } } -void -read_dir(const string& dirname, string filename_suffix, - vector &filenames) { +void read_dir(const string &dirname, string filename_suffix, + vector &filenames) { DIR *dir; if (!(dir = opendir(dirname.c_str()))) throw runtime_error("could not open directory: " + dirname); @@ -186,41 +176,32 @@ read_dir(const string& dirname, string filename_suffix, closedir(dir); } -bool -is_sequence_line(const char *buffer) { - return isvalid(buffer[0]); -} +bool is_sequence_line(const char *buffer) { return isvalid(buffer[0]); } -void -parse_score_line(const char *buffer, vector &scr) { +void parse_score_line(const char *buffer, vector &scr) { for (const char *i = buffer; *i != '\0'; ++i) scr.push_back(*i); } -inline bool -is_fastq_name_line(size_t line_count) { +inline bool is_fastq_name_line(size_t line_count) { return ((line_count & 3ul) == 0ul); } -inline bool -is_fastq_sequence_line(size_t line_count) { +inline bool is_fastq_sequence_line(size_t line_count) { return ((line_count & 3ul) == 1ul); } -inline bool -is_fastq_score_name_line(size_t line_count) { +inline bool is_fastq_score_name_line(size_t line_count) { return ((line_count & 3ul) == 2ul); } -inline bool -is_fastq_score_line(size_t line_count) { +inline bool is_fastq_score_line(size_t line_count) { return ((line_count & 3ul) == 3ul); } -void -read_fastq_file(const char *filename, - vector &names, - vector &sequences, vector > &scores) { +void read_fastq_file(const char *filename, vector &names, + vector &sequences, + vector> &scores) { static const size_t INPUT_BUFFER_SIZE = 1000000; @@ -230,7 +211,7 @@ read_fastq_file(const char *filename, string s, name; vector scr; - vector > scrs; + vector> scrs; bool first_line = true; // ADS: preprocessor stuff below is because is_sequence_line is only // used with asserts; consider removing variable @@ -242,9 +223,8 @@ read_fastq_file(const char *filename, char buffer[INPUT_BUFFER_SIZE + 1]; in.getline(buffer, INPUT_BUFFER_SIZE); if (in.gcount() == static_cast(INPUT_BUFFER_SIZE)) - throw runtime_error( - "Line in " + name + "\nexceeds max length: " - + toa(INPUT_BUFFER_SIZE)); + throw runtime_error("Line in " + name + + "\nexceeds max length: " + toa(INPUT_BUFFER_SIZE)); if (in.gcount() == 0) break; @@ -259,7 +239,8 @@ read_fastq_file(const char *filename, names.push_back(name); sequences.push_back(s); scrs.push_back(scr); - } else + } + else first_line = false; name = buffer; name = name.substr(name.find_first_not_of("@ ")); @@ -278,8 +259,7 @@ read_fastq_file(const char *filename, } if (is_fastq_score_name_line(line_count)) { if (buffer[0] != '+') - throw runtime_error("invalid FASTQ score name line: " + - string(buffer)); + throw runtime_error("invalid FASTQ score name line: " + string(buffer)); } if (is_fastq_score_line(line_count)) { parse_score_line(buffer, scr); @@ -294,12 +274,14 @@ read_fastq_file(const char *filename, bool phred_scores = true, solexa_scores = true; for (size_t i = 0; i < scrs.size() && phred_scores && solexa_scores; ++i) { - phred_scores = phred_scores && - (find_if(begin(scrs[i]), end(scrs[i]), - [](char c) {return !valid_phred_score(c);}) == end(scrs[i])); - solexa_scores = solexa_scores && - (find_if(begin(scrs[i]), end(scrs[i]), - [](char c) {return !valid_solexa_score(c);}) == end(scrs[i])); + phred_scores = + phred_scores && (find_if(begin(scrs[i]), end(scrs[i]), [](char c) { + return !valid_phred_score(c); + }) == end(scrs[i])); + solexa_scores = + solexa_scores && (find_if(begin(scrs[i]), end(scrs[i]), [](char c) { + return !valid_solexa_score(c); + }) == end(scrs[i])); } if (!phred_scores && !solexa_scores) @@ -308,10 +290,9 @@ read_fastq_file(const char *filename, for (size_t i = 0; i < scrs.size(); ++i) { scores.push_back(vector(scrs[i].size())); for (size_t j = 0; j < scrs[i].size(); ++j) - scores[i][j] = - (solexa_scores) ? - quality_character_to_solexa(scrs[i][j] - 5) : - quality_character_to_phred(scrs[i][j]); + scores[i][j] = (solexa_scores) + ? quality_character_to_solexa(scrs[i][j] - 5) + : quality_character_to_phred(scrs[i][j]); scrs[i].clear(); } } @@ -337,9 +318,8 @@ void read_fastq_file(const char *filename, vector &names, char buffer[INPUT_BUFFER_SIZE + 1]; in.getline(buffer, INPUT_BUFFER_SIZE); if (in.gcount() == static_cast(INPUT_BUFFER_SIZE)) - throw runtime_error( - "Line in " + name + "\nexceeds max length: " - + toa(INPUT_BUFFER_SIZE)); + throw runtime_error("Line in " + name + + "\nexceeds max length: " + toa(INPUT_BUFFER_SIZE)); if (in.gcount() == 0) break; @@ -354,7 +334,8 @@ void read_fastq_file(const char *filename, vector &names, names.push_back(name); sequences.push_back(s); scores.push_back(scr); - } else + } + else first_line = false; name = buffer; name = name.substr(name.find_first_not_of("@ ")); @@ -371,8 +352,7 @@ void read_fastq_file(const char *filename, vector &names, } if (is_fastq_score_name_line(line_count)) { if (buffer[0] != '+') - throw runtime_error("invalid FASTQ score name line: " + - string(buffer)); + throw runtime_error("invalid FASTQ score name line: " + string(buffer)); } if (is_fastq_score_line(line_count)) { scr = buffer; @@ -386,10 +366,8 @@ void read_fastq_file(const char *filename, vector &names, } } -void -read_fasta_file_short_names(const string &filename, - vector &names, - vector &sequences) { +void read_fasta_file_short_names(const string &filename, vector &names, + vector &sequences) { std::ifstream in(filename); if (!in) @@ -406,17 +384,16 @@ read_fasta_file_short_names(const string &filename, names.push_back(line.substr(1)); else names.push_back(string(begin(line) + 1, - begin(line) + line.find_first_of(" \t", 1))); + begin(line) + line.find_first_of(" \t", 1))); sequences.push_back(string()); } - else sequences.back() += line; + else + sequences.back() += line; } } -void -read_fasta_file(const string &filename, - vector &names, - vector &sequences) { +void read_fasta_file(const string &filename, vector &names, + vector &sequences) { std::ifstream in(filename.c_str()); if (!in) @@ -432,14 +409,13 @@ read_fasta_file(const string &filename, names.push_back(line.substr(1)); sequences.push_back(string()); } - else sequences.back() += line; + else + sequences.back() += line; } } -void -read_fasta_file(const string &filename, - const string &target, - string &sequence) { +void read_fasta_file(const string &filename, const string &target, + string &sequence) { // read the sequence with the given name from a fasta file @@ -457,8 +433,8 @@ read_fasta_file(const string &filename, char buffer[INPUT_BUFFER_SIZE + 1]; in.getline(buffer, INPUT_BUFFER_SIZE); if (in.gcount() == static_cast(INPUT_BUFFER_SIZE)) - throw runtime_error("Line in " + name + "\nexceeds max length: " - + toa(INPUT_BUFFER_SIZE)); + throw runtime_error("Line in " + name + + "\nexceeds max length: " + toa(INPUT_BUFFER_SIZE)); // correct for dos carriage returns before newlines if (buffer[strlen(buffer) - 1] == '\r') buffer[strlen(buffer) - 1] = '\0'; @@ -468,7 +444,8 @@ read_fasta_file(const string &filename, in.close(); return; } - else first_line = false; + else + first_line = false; name = buffer; name = name.substr(name.find_first_not_of("> ")); const size_t first_whitespace = name.find_first_of(" \t"); @@ -479,14 +456,13 @@ read_fasta_file(const string &filename, else if (name == target) s += buffer; in.peek(); - } //while + } // while if (!first_line && s.length() > 0 && name == target) std::swap(sequence, s); } -void -read_filename_file(const char *filename, vector &filenames) { +void read_filename_file(const char *filename, vector &filenames) { static const size_t INPUT_BUFFER_SIZE = 1000000; @@ -499,15 +475,13 @@ read_filename_file(const char *filename, vector &filenames) { in.getline(buffer, INPUT_BUFFER_SIZE); if (in.gcount() == static_cast(INPUT_BUFFER_SIZE)) throw runtime_error("Line in " + string(filename) + - "\nexceeds max length: " - + toa(INPUT_BUFFER_SIZE)); + "\nexceeds max length: " + toa(INPUT_BUFFER_SIZE)); filenames.push_back(buffer); in.peek(); } } -size_t -get_filesize(string filename) { +size_t get_filesize(string filename) { std::ifstream f(filename.c_str()); if (!f.good()) return 0; @@ -520,8 +494,7 @@ get_filesize(string filename) { return end_pos - begin_pos; } -string -basename(string filename) { +string basename(string filename) { const string s(filename.substr(0, filename.find_last_of("."))); const size_t final_slash = s.find_last_of("/"); if (final_slash != string::npos) @@ -530,8 +503,7 @@ basename(string filename) { return s; } -void -read_dir(const string& dirname, vector &filenames) { +void read_dir(const string &dirname, vector &filenames) { DIR *dir; if (!(dir = opendir(dirname.c_str()))) throw "could not open directory: " + dirname; @@ -550,8 +522,7 @@ read_dir(const string& dirname, vector &filenames) { closedir(dir); } -bool -is_valid_output_file(const string &filename) { +bool is_valid_output_file(const string &filename) { // ADS: seems like there is no way around "access" and apparently // access is not a great solution anyway. if (std::filesystem::exists(filename)) @@ -569,9 +540,8 @@ is_valid_output_file(const string &filename) { } } -bool -has_gz_ext(const string &filename) { +bool has_gz_ext(const string &filename) { const string ext(".gz"); - return filename.size() >= ext.size() - && filename.compare(filename.size() - ext.size(), ext.size(), ext) == 0; + return filename.size() >= ext.size() && + filename.compare(filename.size() - ext.size(), ext.size(), ext) == 0; } diff --git a/smithlab_os.hpp b/smithlab_os.hpp index abaa525..c647d70 100644 --- a/smithlab_os.hpp +++ b/smithlab_os.hpp @@ -20,100 +20,73 @@ #define SMITHLAB_OS_HPP #include -#include #include +#include -bool -isdir(const char *filename); - -bool -is_fastq(const std::string &filename); - -bool -is_valid_filename(const std::string &name, - const std::string &filename_suffix); - -std::string -path_join(const std::string &a, const std::string &b); - -void -identify_chromosomes(const std::string &chrom_file, - const std::string fasta_suffix, // could be const char* - std::unordered_map &chrom_files); - -void -identify_and_read_chromosomes(const std::string &chrom_file, - const std::string fasta_suffix, - std::unordered_map &chrom_files); - -void -read_dir(const std::string &dirname, - std::string filename_suffix, // could be const char* - std::vector &filenames); - -void -read_fasta_file(const std::string &filename, - std::vector &names, - std::vector &sequences); -void -read_fasta_file_short_names(const std::string &filename, - std::vector &names, - std::vector &sequences); +bool isdir(const char *filename); -// This verstion looks for the sequence matching a particular name -void -read_fasta_file(const std::string &filename, - const std::string &target, - std::string &sequence); +bool is_fastq(const std::string &filename); + +bool is_valid_filename(const std::string &name, + const std::string &filename_suffix); + +std::string path_join(const std::string &a, const std::string &b); -void -read_fastq_file(const char *filename, - std::vector &names, - std::vector &sequences, - std::vector > &scores); +void identify_chromosomes( + const std::string &chrom_file, + const std::string fasta_suffix, // could be const char* + std::unordered_map &chrom_files); + +void identify_and_read_chromosomes( + const std::string &chrom_file, const std::string fasta_suffix, + std::unordered_map &chrom_files); + +void read_dir(const std::string &dirname, + std::string filename_suffix, // could be const char* + std::vector &filenames); + +void read_fasta_file(const std::string &filename, + std::vector &names, + std::vector &sequences); +void read_fasta_file_short_names(const std::string &filename, + std::vector &names, + std::vector &sequences); + +// This verstion looks for the sequence matching a particular name +void read_fasta_file(const std::string &filename, const std::string &target, + std::string &sequence); -void -read_fastq_file(const char *filename, - std::vector &names, - std::vector &sequences, - std::vector &scores); +void read_fastq_file(const char *filename, std::vector &names, + std::vector &sequences, + std::vector> &scores); -void -read_filename_file(const char *filename, - std::vector &filenames); +void read_fastq_file(const char *filename, std::vector &names, + std::vector &sequences, + std::vector &scores); -size_t -get_filesize(std::string filename); +void read_filename_file(const char *filename, + std::vector &filenames); -std::string -basename(std::string filename); +size_t get_filesize(std::string filename); -void -parse_dir_basename_suffix(const std::string &full_path, - std::string &dirname, - std::string &base_name, - std::string &suffix); +std::string basename(std::string filename); -void -format_dir_basename_suffix(const std::string &directory_name, - const std::string &file_base_name, - const std::string &file_name_suffix, - std::string &full_path); +void parse_dir_basename_suffix(const std::string &full_path, + std::string &dirname, std::string &base_name, + std::string &suffix); -std::string -strip_path(std::string full_path); +void format_dir_basename_suffix(const std::string &directory_name, + const std::string &file_base_name, + const std::string &file_name_suffix, + std::string &full_path); -std::string -strip_path_and_suffix(std::string full_path); +std::string strip_path(std::string full_path); -void -read_dir(const std::string& dirname, std::vector &filenames); +std::string strip_path_and_suffix(std::string full_path); -bool -is_valid_output_file(const std::string &filename); +void read_dir(const std::string &dirname, std::vector &filenames); +bool is_valid_output_file(const std::string &filename); bool has_gz_ext(const std::string &filename); diff --git a/smithlab_utils.cpp b/smithlab_utils.cpp index edd1e2b..c21f71e 100644 --- a/smithlab_utils.cpp +++ b/smithlab_utils.cpp @@ -21,21 +21,21 @@ */ #include "smithlab_utils.hpp" +#include #include #include -#include -using std::vector; using std::string; +using std::vector; -char have_smithlab_cpp() {return 1;} +char have_smithlab_cpp() { return 1; } /* Performs Hochberg step-up p-value adjustment. */ void smithlab::correct_pvals(const size_t n_tests, vector &pvals) { assert(!pvals.empty()); - vector > > idx; + vector>> idx; for (size_t i = 0; i < pvals.size(); ++i) idx.push_back(std::make_pair(pvals[i], vector(1, i))); std::sort(idx.begin(), idx.end()); @@ -44,43 +44,44 @@ void smithlab::correct_pvals(const size_t n_tests, vector &pvals) { for (size_t i = 1; i < idx.size(); ++i) if (idx[j].first == idx[i].first) idx[j].second.push_back(idx[i].second.front()); - else std::swap(idx[++j], idx[i]); + else + std::swap(idx[++j], idx[i]); idx.erase(idx.begin() + j + 1, idx.end()); double running_count = idx.front().second.size(); - idx.front().first *= (n_tests/running_count); + idx.front().first *= (n_tests / running_count); for (size_t i = 1; i < idx.size(); ++i) { running_count += idx[i].second.size(); - idx[i].first = std::max(idx[i-1].first, idx[i].first*(n_tests/running_count)); + idx[i].first = + std::max(idx[i - 1].first, idx[i].first * (n_tests / running_count)); } for (size_t i = 0; i < idx.size(); ++i) { for (size_t j = 0; j < idx[i].second.size(); ++j) pvals[idx[i].second[j]] = idx[i].first; } - } /* Benjamini-Hochberg step-up FDR cutoff */ -double -smithlab::get_fdr_cutoff(const size_t n_tests, vector &pvals, - const double alpha) { - if (alpha <= 0) return std::numeric_limits::max(); - else if (alpha > 1) return std::numeric_limits::min(); +double smithlab::get_fdr_cutoff(const size_t n_tests, vector &pvals, + const double alpha) { + if (alpha <= 0) + return std::numeric_limits::max(); + else if (alpha > 1) + return std::numeric_limits::min(); std::sort(pvals.begin(), pvals.end()); const size_t n_pvals = pvals.size(); size_t i = 0; - while (i < n_pvals - 1 && pvals[i+1] < - alpha*static_cast(i+1)/n_tests) + while (i < n_pvals - 1 && + pvals[i + 1] < alpha * static_cast(i + 1) / n_tests) ++i; return pvals[i]; } -char -complement(int i) { +char complement(int i) { static const int b2c_size = 20; // clang-format off static const char b2c[] = { @@ -96,19 +97,19 @@ complement(int i) { return b2c[i - 'A']; else if (i - 'a' >= 0 && i - 'a' < b2c_size) return b2cl[i - 'a']; - else return 'N'; + else + return 'N'; } - -std::vector -smithlab::split(std::string s, const char *delim, bool get_empty_fields) { - std::vector parts; +std::vector smithlab::split(std::string s, const char *delim, + bool get_empty_fields) { + std::vector parts; size_t i = 0, j = 0, dlen = strlen(delim); while (i < s.length()) { bool prev_in_delim = false; if (i > 0) for (size_t k = 0; k < dlen; ++k) - if (s[i-1] == delim[k]) + if (s[i - 1] == delim[k]) prev_in_delim = true; bool curr_in_delim = false; for (size_t k = 0; k < dlen; ++k) @@ -125,8 +126,7 @@ smithlab::split(std::string s, const char *delim, bool get_empty_fields) { return parts; } -std::string -smithlab::strip(const std::string& s) { +std::string smithlab::strip(const std::string &s) { const size_t len = s.length(); size_t i = 0; while (i < len && isspace(s[i])) { @@ -139,15 +139,14 @@ smithlab::strip(const std::string& s) { j++; if (i == 0 && j == len) return s; - else return s.substr(i, j - i); + else + return s.substr(i, j - i); } -void -smithlab::split_whitespace(const string &s, vector &v) { +void smithlab::split_whitespace(const string &s, vector &v) { v.clear(); std::istringstream iss(s); - copy(std::istream_iterator(iss), - std::istream_iterator(), + copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter(v)); } @@ -186,23 +185,23 @@ smithlab::split_whitespace_quoted(std::string to_split) { } if (to_split[end_pos] == '\\') ++end_pos; - else break; + else + break; } while (true); ++end_pos; } if (end_pos >= length_of_to_split) { - words.push_back(to_split.substr(start_pos, - end_pos - start_pos)); + words.push_back(to_split.substr(start_pos, end_pos - start_pos)); break; } } return words; } -double -smithlab::log_sum_log_vec(const std::vector &vals, size_t limit) { +double smithlab::log_sum_log_vec(const std::vector &vals, + size_t limit) { const std::vector::const_iterator x = - max_element(vals.begin(), vals.begin() + limit); + max_element(vals.begin(), vals.begin() + limit); const double max_val = *x; const size_t max_idx = x - vals.begin(); double sum = 1.0; @@ -215,25 +214,24 @@ smithlab::log_sum_log_vec(const std::vector &vals, size_t limit) { /**** * @summary: remove empty (or only whitespace) strings from a vector of string */ -std::vector -smithlab::squash(const std::vector& v) { +std::vector smithlab::squash(const std::vector &v) { std::vector res; for (size_t i = 0; i < v.size(); i++) { std::string t = v[i]; strip(t); - if (t != "") res.push_back(t); + if (t != "") + res.push_back(t); } return res; } -void -ProgressBar::report(std::ostream &out, const size_t i) { - prev = std::round((100.0*std::min(i, total))/total); +void ProgressBar::report(std::ostream &out, const size_t i) { + prev = std::round((100.0 * std::min(i, total)) / total); const size_t x = - std::min(static_cast(bar_width*(prev/100.0)), bar_width); + std::min(static_cast(bar_width * (prev / 100.0)), bar_width); fill_n(begin(bar), x, '='); - out << left_tag << mid_tag << "|" << bar << "|" - << std::setw(3) << prev << right_tag; + out << left_tag << mid_tag << "|" << bar << "|" << std::setw(3) << prev + << right_tag; if (i >= total) out << '\n'; } diff --git a/smithlab_utils.hpp b/smithlab_utils.hpp index 089e428..f47d97c 100644 --- a/smithlab_utils.hpp +++ b/smithlab_utils.hpp @@ -24,235 +24,286 @@ #ifndef SMITHLAB_UTILS_HPP #define SMITHLAB_UTILS_HPP -#include -#include #include -#include -#include -#include -#include -#include #include #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include -extern "C" {char have_smithlab_cpp();} +extern "C" { +char have_smithlab_cpp(); +} namespace smithlab { - template Out - copy_if(In first, In last, Out res, Pred p) { - while (first != last) { - if (p(*first)) *res++ = *first; - ++first; - } - return res; +template +Out copy_if(In first, In last, Out res, Pred p) { + while (first != last) { + if (p(*first)) + *res++ = *first; + ++first; } -}; + return res; +} +}; // namespace smithlab typedef size_t MASK_t; namespace smithlab { - // Code dealing with false discovery rate - double get_fdr_cutoff(const size_t n_tests, std::vector &pvals, - const double alpha); +// Code dealing with false discovery rate +double get_fdr_cutoff(const size_t n_tests, std::vector &pvals, + const double alpha); - void correct_pvals(const size_t n_tests, std::vector &pvals); +void correct_pvals(const size_t n_tests, std::vector &pvals); - // Assumes 4 nucleotide DNA alphabet - static const size_t alphabet_size = 4; +// Assumes 4 nucleotide DNA alphabet +static const size_t alphabet_size = 4; - std::vector split(std::string, const char *, - bool get_empty_fields = false); - std::vector split_whitespace_quoted(std::string to_split); - void split_whitespace(const std::string& s, std::vector &v); +std::vector split(std::string, const char *, + bool get_empty_fields = false); +std::vector split_whitespace_quoted(std::string to_split); +void split_whitespace(const std::string &s, std::vector &v); - std::string strip(const std::string& s); +std::string strip(const std::string &s); - std::vector - squash(const std::vector &v); +std::vector squash(const std::vector &v); - template std::string toa(T t) { - std::ostringstream s; - s << t; - return s.str(); - } - double log_sum_log_vec(const std::vector &vals, size_t lim); - - template double - log_sum_log(FwrdItr first, FwrdItr last) { - const auto max_itr = std::max_element(first, last); - const double max_val = *max_itr; - double sum = 1.0; - for (auto itr = first; itr != last; ++itr) - sum += (itr == max_itr ? 0.0 : std::exp(*itr - max_val)); - return max_val + std::log(sum); +template std::string toa(T t) { + std::ostringstream s; + s << t; + return s.str(); +} +double log_sum_log_vec(const std::vector &vals, size_t lim); + +template double log_sum_log(FwrdItr first, FwrdItr last) { + const auto max_itr = std::max_element(first, last); + const double max_val = *max_itr; + double sum = 1.0; + for (auto itr = first; itr != last; ++itr) + sum += (itr == max_itr ? 0.0 : std::exp(*itr - max_val)); + return max_val + std::log(sum); +} +inline double log_sum_log(const double p, const double q) { + if (p == 0) { + return q; } - inline double - log_sum_log(const double p, const double q) { - if (p == 0) {return q;} - else if (q == 0) {return p;} - const double larger = (p > q) ? p : q; - return larger + log(1.0 + exp((p > q ? q : p) - larger)); - } - - inline double - log_sum_log(const double p, const double q, const double r) { - return log_sum_log(log_sum_log(p, q), r); + else if (q == 0) { + return p; } + const double larger = (p > q) ? p : q; + return larger + log(1.0 + exp((p > q ? q : p) - larger)); } -namespace smithlab_bits { - static const MASK_t low_bit = 1ul; - static const MASK_t high_bit = static_cast(0x8000000000000000ul); - static const MASK_t all_ones = static_cast(-1); - static const MASK_t all_zeros = 0ul; - static const size_t word_size = 64ul; +inline double log_sum_log(const double p, const double q, const double r) { + return log_sum_log(log_sum_log(p, q), r); } +} // namespace smithlab -inline size_t -base2int_upper_only(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 1; - case 'G' : return 2; - case 'T' : return 3; - default : return 4; +namespace smithlab_bits { +static const MASK_t low_bit = 1ul; +static const MASK_t high_bit = static_cast(0x8000000000000000ul); +static const MASK_t all_ones = static_cast(-1); +static const MASK_t all_zeros = 0ul; +static const size_t word_size = 64ul; +} // namespace smithlab_bits + +inline size_t base2int_upper_only(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + return 4; } } -inline size_t -base2int(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 1; - case 'G' : return 2; - case 'T' : return 3; - case 'a' : return 0; - case 'c' : return 1; - case 'g' : return 2; - case 't' : return 3; - default : return 4; +inline size_t base2int(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + case 'a': + return 0; + case 'c': + return 1; + case 'g': + return 2; + case 't': + return 3; + default: + return 4; } } -inline size_t -base2int_bs_upper_only(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 3; - case 'G' : return 2; - case 'T' : return 3; - default : return 4; +inline size_t base2int_bs_upper_only(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 3; + case 'G': + return 2; + case 'T': + return 3; + default: + return 4; } } -inline size_t -base2int_bs(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 3; - case 'G' : return 2; - case 'T' : return 3; - case 'a' : return 0; - case 'c' : return 3; - case 'g' : return 2; - case 't' : return 3; - default : return 4; +inline size_t base2int_bs(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 3; + case 'G': + return 2; + case 'T': + return 3; + case 'a': + return 0; + case 'c': + return 3; + case 'g': + return 2; + case 't': + return 3; + default: + return 4; } } - -inline size_t -base2int_bs_ag_upper_only(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 1; - case 'G' : return 0; - case 'T' : return 3; - default : return 4; +inline size_t base2int_bs_ag_upper_only(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 0; + case 'T': + return 3; + default: + return 4; } } - - -inline size_t -base2int_bs_ag(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 1; - case 'G' : return 0; - case 'T' : return 3; - case 'a' : return 0; - case 'c' : return 1; - case 'g' : return 0; - case 't' : return 3; - default : return 4; +inline size_t base2int_bs_ag(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 0; + case 'T': + return 3; + case 'a': + return 0; + case 'c': + return 1; + case 'g': + return 0; + case 't': + return 3; + default: + return 4; } } - -inline size_t -base2int_bs_rc(char c) { - switch(c) { - case 'A' : return 0; - case 'C' : return 1; - case 'G' : return 0; - case 'T' : return 3; - case 'a' : return 0; - case 'c' : return 1; - case 'g' : return 0; - case 't' : return 3; +inline size_t base2int_bs_rc(char c) { + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 0; + case 'T': + return 3; + case 'a': + return 0; + case 'c': + return 1; + case 'g': + return 0; + case 't': + return 3; } return 4; } -inline size_t -base2int_rc(char c) { - switch(c) { - case 'A' : return 3; - case 'C' : return 2; - case 'G' : return 1; - case 'T' : return 0; - case 'a' : return 3; - case 'c' : return 2; - case 'g' : return 1; - case 't' : return 0; +inline size_t base2int_rc(char c) { + switch (c) { + case 'A': + return 3; + case 'C': + return 2; + case 'G': + return 1; + case 'T': + return 0; + case 'a': + return 3; + case 'c': + return 2; + case 'g': + return 1; + case 't': + return 0; } return 4; } -inline char -int2base(size_t c) { - switch(c) { - case 0 : return 'A'; - case 1 : return 'C'; - case 2 : return 'G'; - case 3 : return 'T'; +inline char int2base(size_t c) { + switch (c) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; } return 'N'; } -inline char -int2base_rc(size_t c) { - switch(c) { - case 3 : return 'A'; - case 2 : return 'C'; - case 1 : return 'G'; - case 0 : return 'T'; +inline char int2base_rc(size_t c) { + switch (c) { + case 3: + return 'A'; + case 2: + return 'C'; + case 1: + return 'G'; + case 0: + return 'T'; } return 'N'; } -inline bool -isvalid(char c) { - return (base2int(c) != 4); -} +inline bool isvalid(char c) { return (base2int(c) != 4); } -inline std::string -i2mer(size_t n, size_t index) { +inline std::string i2mer(size_t n, size_t index) { std::string s(n, ' '); do { --n; @@ -262,8 +313,7 @@ i2mer(size_t n, size_t index) { return s; } -inline std::string -i2mer_rc(size_t n, size_t index) { +inline std::string i2mer_rc(size_t n, size_t index) { std::string s(n, ' '); do { --n; @@ -273,22 +323,22 @@ i2mer_rc(size_t n, size_t index) { return s; } -inline size_t -mer2i(const std::string::const_iterator a, std::string::const_iterator b) { +inline size_t mer2i(const std::string::const_iterator a, + std::string::const_iterator b) { size_t multiplier = 1, index = 0; do { --b; - index += base2int(*b)*multiplier; + index += base2int(*b) * multiplier; multiplier *= smithlab::alphabet_size; } while (b > a); return index; } -inline size_t -mer2i_rc(std::string::const_iterator a, const std::string::const_iterator b) { +inline size_t mer2i_rc(std::string::const_iterator a, + const std::string::const_iterator b) { size_t multiplier = 1, index = 0; do { - index += base2int_rc(*a)*multiplier; + index += base2int_rc(*a) * multiplier; multiplier *= smithlab::alphabet_size; } while (++a < b); return index; @@ -303,31 +353,27 @@ template std::string toa(T t) { //////////////////////////////////////////////////////////////////////// // Code for dealing with the DNA alphabet -char -complement(int i); +char complement(int i); -inline std::string -revcomp(const std::string& s) { +inline std::string revcomp(const std::string &s) { std::string r; std::transform(s.begin(), s.end(), back_inserter(r), complement); std::reverse(r.begin(), r.end()); return r; } -inline void -revcomp_inplace(std::string& s) { +inline void revcomp_inplace(std::string &s) { std::transform(s.begin(), s.end(), s.begin(), complement); std::reverse(s.begin(), s.end()); } -inline void -revcomp_inplace(std::string::iterator first, std::string::iterator last) { +inline void revcomp_inplace(std::string::iterator first, + std::string::iterator last) { std::transform(first, last, first, complement); std::reverse(first, last); } -inline std::string -bits2string_masked(size_t mask, size_t bits) { +inline std::string bits2string_masked(size_t mask, size_t bits) { std::string s; size_t selector = smithlab_bits::high_bit; for (size_t i = 0; i < smithlab_bits::word_size; ++i) { @@ -337,8 +383,7 @@ bits2string_masked(size_t mask, size_t bits) { return s; } -inline std::string -bits2string_for_positions(size_t positions, size_t bits) { +inline std::string bits2string_for_positions(size_t positions, size_t bits) { std::string s; size_t selector = smithlab_bits::high_bit; for (size_t i = 0; i < smithlab_bits::word_size; ++i) { @@ -348,35 +393,30 @@ bits2string_for_positions(size_t positions, size_t bits) { return s.substr(s.length() - positions); } -inline size_t -percent(const size_t a, const size_t b) { - return static_cast((100.0*a)/b); +inline size_t percent(const size_t a, const size_t b) { + return static_cast((100.0 * a) / b); } -inline bool -valid_base(char c) { +inline bool valid_base(char c) { char i = std::toupper(c); return (i == 'A' || i == 'C' || i == 'G' || i == 'T'); } -inline size_t -mer2index(const char *s, size_t n) { +inline size_t mer2index(const char *s, size_t n) { size_t multiplier = 1, index = 0; do { --n; - index += base2int(s[n])*multiplier; + index += base2int(s[n]) * multiplier; multiplier *= smithlab::alphabet_size; } while (n > 0); return index; } -inline size_t -kmer_counts(const std::vector &seqs, - std::vector &counts, size_t k) { +inline size_t kmer_counts(const std::vector &seqs, + std::vector &counts, size_t k) { counts.clear(); - size_t nwords = - static_cast(pow(static_cast(smithlab::alphabet_size), - static_cast(k))); + size_t nwords = static_cast( + pow(static_cast(smithlab::alphabet_size), static_cast(k))); counts.resize(nwords, 0); size_t total = 0; for (size_t i = 0; i < seqs.size(); ++i) { @@ -411,8 +451,8 @@ kmer_counts(const std::vector &seqs, */ class ProgressBar { public: - ProgressBar(const size_t x, const std::string message = "completion") : - total(x), prev(0), mid_tag(message) { + ProgressBar(const size_t x, const std::string message = "completion") + : total(x), prev(0), mid_tag(message) { // the 3 below is for the default left_tag and right_tag printed // width and the 5 is for the width of the percent (up to 100) // plus two pipes ('|') @@ -420,13 +460,11 @@ class ProgressBar { bar = std::string(bar_width, ' '); } bool time_to_report(const size_t i) const { - return std::round((100.0*std::min(i, total))/total) > prev; + return std::round((100.0 * std::min(i, total)) / total) > prev; } - void - report(std::ostream &out, const size_t i); + void report(std::ostream &out, const size_t i); private: - size_t total{}; size_t prev{}; size_t bar_width{}; diff --git a/zlib_wrapper.cpp b/zlib_wrapper.cpp index 109a2e3..7957f2d 100644 --- a/zlib_wrapper.cpp +++ b/zlib_wrapper.cpp @@ -19,16 +19,14 @@ using std::string; -char -igzfstream::peek() { +char igzfstream::peek() { const char c = gzgetc(fileobj); if (!gzeof(fileobj)) gzungetc(c, fileobj); return c; } -igzfstream& -getline(igzfstream &in, string &line) { +igzfstream &getline(igzfstream &in, string &line) { if (gzgets(in.fileobj, &in.buf[0], in.chunk_size)) { auto eol = std::find(begin(in.buf), end(in.buf), '\n'); line.clear(); @@ -38,19 +36,16 @@ getline(igzfstream &in, string &line) { return in; } -igzfstream& -operator>>(igzfstream &in, string &line) { +igzfstream &operator>>(igzfstream &in, string &line) { return getline(in, line); } -ogzfstream& -operator<<(ogzfstream &out, const string &line) { +ogzfstream &operator<<(ogzfstream &out, const string &line) { gzputs(out.fileobj, line.c_str()); return out; } -ogzfstream& -operator<<(ogzfstream &out, const char c) { +ogzfstream &operator<<(ogzfstream &out, const char c) { gzputc(out.fileobj, c); return out; } diff --git a/zlib_wrapper.hpp b/zlib_wrapper.hpp index 8ce75ac..bdbf5cd 100644 --- a/zlib_wrapper.hpp +++ b/zlib_wrapper.hpp @@ -16,22 +16,22 @@ #ifndef ZLIB_WRAPPER_HPP #define ZLIB_WRAPPER_HPP -#include #include +#include // ADS: need some way to check if this is installed #include struct igzfstream { - igzfstream(const std::string filename) : - fileobj(gzopen(filename.c_str(), "r")) { - chunk_size = 16 * 1024; - // gz internal buffer has default size of 8k - gzbuffer(fileobj, chunk_size); - buf.resize(chunk_size, ' '); - } - ~igzfstream() {gzclose_r(fileobj);} - operator bool() const {return (fileobj != NULL) && !gzeof(fileobj);} + igzfstream(const std::string filename) + : fileobj(gzopen(filename.c_str(), "r")) { + chunk_size = 16 * 1024; + // gz internal buffer has default size of 8k + gzbuffer(fileobj, chunk_size); + buf.resize(chunk_size, ' '); + } + ~igzfstream() { gzclose_r(fileobj); } + operator bool() const { return (fileobj != NULL) && !gzeof(fileobj); } char peek(); size_t chunk_size; @@ -39,29 +39,24 @@ struct igzfstream { gzFile fileobj; }; -igzfstream& -getline(igzfstream &in, std::string &line); - -igzfstream& -operator>>(igzfstream &in, std::string &line); +igzfstream &getline(igzfstream &in, std::string &line); +igzfstream &operator>>(igzfstream &in, std::string &line); struct ogzfstream { - ogzfstream(const std::string filename) : - fileobj(gzopen(filename.c_str(), "w")) {} - ~ogzfstream() {gzclose_w(fileobj);} + ogzfstream(const std::string filename) + : fileobj(gzopen(filename.c_str(), "w")) {} + ~ogzfstream() { gzclose_w(fileobj); } /* Below: eof is not a good error indicator for output streams and * we often use this only when checking after opening a file. This * function needs improving */ - operator bool() const {return fileobj != NULL;} + operator bool() const { return fileobj != NULL; } gzFile fileobj; }; -ogzfstream& -operator<<(ogzfstream &out, const std::string &line); +ogzfstream &operator<<(ogzfstream &out, const std::string &line); -ogzfstream& -operator<<(ogzfstream &out, const char c); +ogzfstream &operator<<(ogzfstream &out, const char c); #endif From d99fff90e310e69169502f230c8208809719169e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 10:16:24 -0700 Subject: [PATCH 3/3] Fixing bug in clang-format workflow --- .github/workflows/source-format-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/source-format-check.yml b/.github/workflows/source-format-check.yml index 8417af0..c604dd0 100644 --- a/.github/workflows/source-format-check.yml +++ b/.github/workflows/source-format-check.yml @@ -17,4 +17,4 @@ jobs: run: sudo apt-get install -y clang-format - name: Run clang-format run: | - git $(ls-files '*.*pp') | xargs clang-format -n -Werror + clang-format -n -Werror $(git ls-files '*.*pp')