From c6ecbec589675ea6b90d9293ad291f8c6b11d452 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 19 Oct 2024 18:00:41 -0700 Subject: [PATCH] src/common/bam_record_utils.cpp: added a throw where an exception was created without being thrown --- src/common/bam_record_utils.cpp | 170 ++++++++++++++++++++------------ 1 file changed, 105 insertions(+), 65 deletions(-) diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index 1f0b171c..ec92f811 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -122,7 +122,8 @@ sam_realloc_bam_data(bam1_t *b, size_t desired) { bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); } } - if (!new_data) return -1; + if (!new_data) + return -1; b->data = new_data; b->m_data = new_m_data; return 0; @@ -267,21 +268,25 @@ to_insertion(const uint32_t x) { static inline void fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 3) return; + if (n_cigar < 3) + return; // find first non-softclip auto c_beg = cigar; auto c_end = cigar + n_cigar; while (!cigar_eats_ref(*c_beg) && ++c_beg != c_end) ; - if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); + if (c_beg == c_end) + throw dnmt_error("cigar eats no ref"); while (!cigar_eats_ref(*(c_end - 1)) && --c_end != c_beg) ; - if (c_beg == c_end) throw dnmt_error("cigar eats no ref"); + if (c_beg == c_end) + throw dnmt_error("cigar eats no ref"); for (auto c_itr = c_beg; c_itr != c_end; ++c_itr) - if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP) *c_itr = to_insertion(*c_itr); + if (bam_cigar_op(*c_itr) == BAM_CSOFT_CLIP) + *c_itr = to_insertion(*c_itr); } static inline uint32_t @@ -291,7 +296,8 @@ to_softclip(const uint32_t x) { static inline void fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 2) return; + if (n_cigar < 2) + return; auto c_itr = cigar; const auto c_end = c_itr + n_cigar; @@ -299,7 +305,8 @@ fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { for (; !cigar_eats_ref(*c_itr) && c_itr != c_end; ++c_itr) *c_itr = to_softclip(*c_itr); - if (c_itr == c_end) throw dnmt_error("cigar eats no ref"); + if (c_itr == c_end) + throw dnmt_error("cigar eats no ref"); c_itr = cigar + n_cigar - 1; for (; !cigar_eats_ref(*c_itr) && c_itr != cigar; --c_itr) @@ -308,7 +315,8 @@ fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { static inline size_t merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) { - if (n_cigar < 2) return n_cigar; + if (n_cigar < 2) + return n_cigar; auto c_itr1 = cigar; auto c_end = c_itr1 + n_cigar; auto c_itr2 = c_itr1 + 1; @@ -404,7 +412,8 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, uint32_t i = 0; for (i = 0; i < in_ops; ++i) { if (cigar_eats_ref(cig_in[i])) { - if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) break; + if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) + break; rlen += bam_cigar_oplen(cig_in[i]); } } @@ -421,7 +430,8 @@ get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, * output of "A-" is "-T", and the output of "C-" is "-G", and so * forth. The user must handle this case separately. */ -const uint8_t byte_revcom_table[] = { +const uint8_t byte_revcomp_table[] = { + // clang-format off 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248, 4, 132, 68, 0, 36, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 244, 0, 0, 0, 0, 0, 0, @@ -436,28 +446,31 @@ const uint8_t byte_revcom_table[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, - 0, 0, 0, 255}; + 0, 0, 0, 255 + // clang-format on +}; static inline void -revcom_byte_then_reverse(unsigned char *a, unsigned char *b) { - unsigned char *p1, *p2; - for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { - *p1 = byte_revcom_table[*p1]; - *p2 = byte_revcom_table[*p2]; +revcomp_byte_then_reverse(unsigned char *const a, unsigned char *const b) { + unsigned char *p1{a}, *p2{b}; + for (p2 -= 1; p2 > p1; ++p1, --p2) { + *p1 = byte_revcomp_table[*p1]; + *p2 = byte_revcomp_table[*p2]; *p1 ^= *p2; *p2 ^= *p1; *p1 ^= *p2; } - if (p1 == p2) *p1 = byte_revcom_table[*p1]; + if (p1 == p2) + *p1 = byte_revcomp_table[*p1]; } static inline void -revcomp_seq_by_byte(bam1_t *aln) { +revcomp_seq_by_byte(bam1_t *const aln) { const size_t l_qseq = get_l_qseq(aln); auto seq = bam_get_seq(aln); - const size_t num_bytes = ceil(l_qseq / 2.0); + const size_t num_bytes = (l_qseq + 1) / 2; // integer ceil / 2 auto seq_end = seq + num_bytes; - revcom_byte_then_reverse(seq, seq_end); + revcomp_byte_then_reverse(seq, seq_end); if (l_qseq % 2 == 1) { // for odd-length sequences for (size_t i = 0; i < num_bytes - 1; i++) { // swap 4-bit chunks within consecutive bytes like this: @@ -475,7 +488,7 @@ revcomp_seq_by_byte(bam1_t *aln) { // has been set to ceil((a_used_len + b_seq_len) / 2.0) where // a_used_len = c_seq_len - b_seq_len static inline void -merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) { +merge_by_byte(bam1_t const *const a, bam1_t const *const b, bam1_t *const c) { // ADS: (todo) need some functions for int_ceil and is_odd const size_t b_seq_len = get_l_qseq(b); const size_t c_seq_len = get_l_qseq(c); @@ -485,8 +498,8 @@ merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) { const bool is_b_odd = b_seq_len % 2 == 1; const bool is_c_odd = c_seq_len % 2 == 1; - const size_t a_num_bytes = ceil(a_used_len / 2.0); - const size_t b_num_bytes = ceil(b_seq_len / 2.0); + const size_t a_num_bytes = (a_used_len + 1) / 2; // integer ceil / 2 + const size_t b_num_bytes = (b_seq_len + 1) / 2; // integer ceil / 2 const size_t b_offset = is_a_odd && is_b_odd; @@ -500,25 +513,25 @@ merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) { // or [ aa aa aa a- ] c_seq[a_num_bytes - 1] &= 0xf0; c_seq[a_num_bytes - 1] |= - is_b_odd ? byte_revcom_table[b_seq[b_num_bytes - 1]] - : byte_revcom_table[b_seq[b_num_bytes - 1]] >> 4; + is_b_odd ? byte_revcomp_table[b_seq[b_num_bytes - 1]] + : byte_revcomp_table[b_seq[b_num_bytes - 1]] >> 4; } if (is_c_odd) { // c_seq looks like either [ aa aa aa aa ] // or [ aa aa aa ab ] for (size_t i = 0; i < b_num_bytes - 1; i++) { c_seq[a_num_bytes + i] = - (byte_revcom_table[b_seq[b_num_bytes - i - 1]] << 4) | - (byte_revcom_table[b_seq[b_num_bytes - i - 2]] >> 4); + (byte_revcomp_table[b_seq[b_num_bytes - i - 1]] << 4) | + (byte_revcomp_table[b_seq[b_num_bytes - i - 2]] >> 4); } - c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcom_table[b_seq[0]] << 4; + c_seq[a_num_bytes + b_num_bytes - 1] = byte_revcomp_table[b_seq[0]] << 4; // Here, c_seq is either [ aa aa aa aa bb bb bb b- ] (a even; b odd) // or [ aa aa aa ab bb bb bb b- ] (a odd; b odd) } else { for (size_t i = 0; i < b_num_bytes - b_offset; i++) { c_seq[a_num_bytes + i] = - byte_revcom_table[b_seq[b_num_bytes - i - 1 - b_offset]]; + byte_revcomp_table[b_seq[b_num_bytes - i - 1 - b_offset]]; } // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even) // or [ aa aa aa ab bb bb bb ] (a odd and b odd) @@ -533,7 +546,8 @@ flip_conversion(bam1_t *aln) { // ADS: don't like *(cv + 1) below, but no HTSlib function for it? auto cv = bam_aux_get(aln, "CV"); - if (!cv) throw dnmt_error("bam_aux_get failed for CV"); + if (!cv) + throw dnmt_error("bam_aux_get failed for CV"); *(cv + 1) = 'T'; } @@ -542,15 +556,15 @@ flip_conversion(bam_rec &aln) { flip_conversion(aln.b); } -// static inline bool -// are_mates(const bam1_t *one, const bam1_t *two) { -// return one->core.mtid == two->core.tid && one->core.mpos == two->core.pos && -// (one->core.flag & BAM_FREVERSE) != (one->core.flag & BAM_FREVERSE); -// // below is a consistency check and should not be necessary -// /* && -// two->core.mtid == one->core.tid && -// two->core.mpos == one->core.pos; */ -// } +[[maybe_unused]] static inline bool +are_mates(const bam1_t *one, const bam1_t *two) { + return one->core.mtid == two->core.tid && one->core.mpos == two->core.pos && + (one->core.flag & BAM_FREVERSE) != (one->core.flag & BAM_FREVERSE); + // below is a consistency check and should not be necessary + /* && + two->core.mtid == one->core.tid && + two->core.mpos == one->core.pos; */ +} static inline int truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { @@ -593,7 +607,8 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { isize, // rlen from new cigar c_seq_len, // truncated seq length 8); // enough for the 2 tags? - if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper"); + if (ret < 0) + throw dnmt_error(ret, "bam_set1_wrapper"); const size_t n_bytes_to_copy = (c_seq_len + 1) / 2; // compression std::copy_n(bam_get_seq(a), n_bytes_to_copy, bam_get_seq(c)); @@ -602,19 +617,22 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here! // "_udpate" for "int" because it determines the right size ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int"); + if (ret < 0) + throw dnmt_error(ret, "bam_aux_update_int"); const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV")); // "_append" for "char" because there is no corresponding update ret = bam_aux_append(c, "CV", 'A', 1, &conversion); - if (ret < 0) throw dnmt_error(ret, "bam_aux_append"); + if (ret < 0) + throw dnmt_error(ret, "bam_aux_append"); return ret; } int truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c) { - if (c.b == nullptr) c.b = bam_init1(); + if (c.b == nullptr) + c.b = bam_init1(); return truncate_overlap(a.b, overlap, c.b); } @@ -689,7 +707,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, isize, // updated c_seq_len, // merged sequence length 8); // enough for 2 tags? - if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap"); + if (ret < 0) + throw dnmt_error(ret, "bam_set1_wrapper in merge_overlap"); // Merge the sequences by bytes merge_by_byte(a, b, c); @@ -697,12 +716,14 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, const int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + bam_aux2i(bam_aux_get(b, "NM"))); ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap"); + if (ret < 0) + throw dnmt_error(ret, "bam_aux_update_int in merge_overlap"); // add the tag for conversion const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); ret = bam_aux_append(c, "CV", 'A', 1, &cv); - if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap"); + if (ret < 0) + throw dnmt_error(ret, "bam_aux_append in merge_overlap"); return ret; } @@ -710,7 +731,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, int merge_overlap(const bam_rec &a, const bam_rec &b, const uint32_t head, bam_rec &c) { - if (c.b == nullptr) c.b = bam_init1(); + if (c.b == nullptr) + c.b = bam_init1(); return merge_overlap(a.b, b.b, head, c.b); } @@ -756,7 +778,8 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, isize, // TLEN (relative to reference; SAM docs) c_seq_len, // merged sequence length 8); // enough for 2 tags of 1 byte value? - if (ret < 0) throw dnmt_error(ret, "bam_set1 in merge_non_overlap"); + if (ret < 0) + throw dnmt_error(ret, "bam_set1 in merge_non_overlap"); merge_by_byte(a, b, c); @@ -765,12 +788,14 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, (bam_aux2i(bam_aux_get(a, "NM")) + bam_aux2i(bam_aux_get(b, "NM"))); // "udpate" for "int" because it determines the right size ret = bam_aux_update_int(c, "NM", nm); - if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int"); + if (ret < 0) + throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int"); const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); // "append" for "char" because there is no corresponding update ret = bam_aux_append(c, "CV", 'A', 1, &cv); - if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_append"); + if (ret < 0) + throw dnmt_error(ret, "merge_non_overlap:bam_aux_append"); return ret; } @@ -778,7 +803,8 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, int merge_non_overlap(const bam_rec &a, const bam_rec &b, const uint32_t spacer, bam_rec &c) { - if (c.b == nullptr) c.b = bam_init1(); + if (c.b == nullptr) + c.b = bam_init1(); return merge_non_overlap(a.b, b.b, spacer, c.b); } @@ -797,30 +823,35 @@ keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) { int keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) { - if (c.b == nullptr) c.b = bam_init1(); + if (c.b == nullptr) + c.b = bam_init1(); return keep_better_end(a.b, b.b, c.b); } // ADS: will move to using this function once it is written static inline void standardize_format(const string &input_format, bam1_t *aln) { - int err_code = 0; + int err_code; // = 0; - if (input_format == "abismal" || input_format == "walt") return; + if (input_format == "abismal" || input_format == "walt") + return; if (input_format == "bsmap") { // A/T rich: get the ZS tag value const auto zs_tag = bam_aux_get(aln, "ZS"); - if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)"); + if (!zs_tag) + throw dnmt_error("bam_aux_get for ZS (invalid bsmap)"); // ADS: test for errors on the line below const auto zs_tag_value = string(bam_aux2Z(zs_tag)); - if (zs_tag_value.empty()) throw dnmt_error("empty ZS tag in bsmap format"); + if (zs_tag_value.empty()) + throw dnmt_error("empty ZS tag in bsmap format"); if (zs_tag_value[0] != '-' && zs_tag_value[0] != '+') throw dnmt_error("invalid ZS tag in bsmap format"); const uint8_t cv = zs_tag_value[1] == '-' ? 'A' : 'T'; // get the "mismatches" tag const auto nm_tag = bam_aux_get(aln, "NM"); - if (!nm_tag) throw dnmt_error("invalid NM tag in bsmap format"); + if (!nm_tag) + throw dnmt_error("invalid NM tag in bsmap format"); const int64_t nm = bam_aux2i(nm_tag); // ADS: this should delete the aux data by truncating the used @@ -840,7 +871,8 @@ standardize_format(const string &input_format, bam1_t *aln) { throw dnmt_error(err_code, "error setting conversion in bsmap format"); // reverse complement if needed - if (bam_is_rev(aln)) revcomp_seq_by_byte(aln); + if (bam_is_rev(aln)) + revcomp_seq_by_byte(aln); } else if (input_format == "bismark") { // ADS: Previously we modified the read names at the first @@ -849,11 +881,13 @@ standardize_format(const string &input_format, bam1_t *aln) { // A/T rich; get the XR tag value auto xr_tag = bam_aux_get(aln, "XR"); - if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)"); + if (!xr_tag) + throw dnmt_error("bam_aux_get for XR (invalid bismark)"); const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T'; // get the "mismatches" tag auto nm_tag = bam_aux_get(aln, "NM"); - if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)"); + if (!nm_tag) + throw dnmt_error("bam_aux_get for NM (invalid bismark)"); const int64_t nm = bam_aux2i(nm_tag); aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) @@ -862,17 +896,20 @@ standardize_format(const string &input_format, bam1_t *aln) { // "udpate" for "int" because it determines the right size; even // though we just deleted all tags, it will add it back here. err_code = bam_aux_update_int(aln, "NM", nm); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int"); + if (err_code < 0) + throw dnmt_error(err_code, "bam_aux_update_int"); // "append" for "char" because there is no corresponding update err_code = bam_aux_append(aln, "CV", 'A', 1, &cv); - if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append"); + if (err_code < 0) + throw dnmt_error(err_code, "bam_aux_append"); if (bam_is_rev(aln)) revcomp_seq_by_byte(aln); // reverse complement if needed } // ADS: the condition below should be checked much earlier, ideally // before the output file is created - else throw runtime_error("incorrect format specified: " + input_format); + else + throw runtime_error("incorrect format specified: " + input_format); // Be sure this doesn't depend on mapper! Removes the "qual" part of // the data in a bam1_t struct but does not change its uncompressed @@ -940,7 +977,10 @@ string to_string(const bam_header &hdr, const bam_rec &aln) { kstring_t ks = {0, 0, nullptr}; int ret = sam_format1(hdr.h, aln.b, &ks); - if (ret < 0) { runtime_error("Can't format record: " + to_string(hdr, aln)); } - if (ks.s != nullptr) free(ks.s); + if (ret < 0) { + throw runtime_error("Can't format record: " + to_string(hdr, aln)); + } + if (ks.s != nullptr) + free(ks.s); return string(ks.s); }