From 7fd12c522f4c6753806804dcaa5a798b65f13dfd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 14:17:08 -0700 Subject: [PATCH 1/3] Adding functions to xcounts_utils for checking file format and reading xcounts format files into a concise format for an individual chrom --- src/common/xcounts_utils.cpp | 130 +++++++++++++++++++++++++++++++++++ src/common/xcounts_utils.hpp | 39 +++++++++++ 2 files changed, 169 insertions(+) create mode 100644 src/common/xcounts_utils.cpp create mode 100644 src/common/xcounts_utils.hpp diff --git a/src/common/xcounts_utils.cpp b/src/common/xcounts_utils.cpp new file mode 100644 index 0000000..b4cb0d5 --- /dev/null +++ b/src/common/xcounts_utils.cpp @@ -0,0 +1,130 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023-2024 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "xcounts_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "counts_header.hpp" +#include "bam_record_utils.hpp" + +#include "bamxx.hpp" +#include "dnmt_error.hpp" + +using std::vector; +using std::string; +using std::to_string; +using std::unordered_map; +using std::runtime_error; + +using bamxx::bgzf_file; + + +// careful: this could get big +unordered_map> +read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) { + bamxx::bam_tpool tp(n_threads); + bamxx::bgzf_file in(xcounts_file, "r"); + if (!in) throw runtime_error("failed to open input file"); + + // set the threads for the input file decompression + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); + + kstring_t line{0, 0, nullptr}; + string chrom_name; + uint64_t pos = 0; + + unordered_map> sites_by_chrom; + + vector curr_chrom; + + while (bamxx::getline(in, line)) { + if (is_counts_header_line(line.s)) continue; // ADS: early loop exit + + if (!std::isdigit(line.s[0])) { // check if we have a chrom line + if (!chrom_name.empty()) { + sites_by_chrom.insert({chrom_name, curr_chrom}); + curr_chrom.clear(); + } + chrom_name = string{line.s}; + pos = 0; + continue; + } + + uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0; + const auto end_line = line.s + line.l; + auto res = std::from_chars(line.s, end_line, pos_step); + res = std::from_chars(res.ptr + 1, end_line, n_meth); + res = std::from_chars(res.ptr + 1, end_line, n_unmeth); + + const auto curr_pos = pos + pos_step; + + curr_chrom.push_back({curr_pos, n_meth, n_unmeth}); + + pos = curr_pos; + } + + if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom}); + + return sites_by_chrom; +} + + +bool +get_is_xcounts_file(const std::string &filename) { + static constexpr auto max_lines_to_check = 1000ul; + bamxx::bgzf_file in(filename, "r"); + if (!in) throw dnmt_error{"failed to open input file: " + filename}; + + kstring_t line{0, 0, nullptr}; + + bool found_header = false; + bool found_chrom = false; + + auto line_count = 0ul; + while (line_count++ < max_lines_to_check && bamxx::getline(in, line)) { + if (is_counts_header_line(line.s)) { + found_header = true; + } + else if (!std::isdigit(line.s[0])) { // check if we have a chrom line + if (!found_header) + return false; + found_chrom = true; + } + else { + if (!found_chrom) return false; + int64_t pos_step = 0, n_meth = 0, n_unmeth = 0; + const auto end_line = line.s + line.l; + auto res = std::from_chars(line.s, end_line, pos_step); + if (res.ec != std::errc()) return false; + res = std::from_chars(res.ptr + 1, end_line, n_meth); + if (res.ec != std::errc()) return false; + res = std::from_chars(res.ptr + 1, end_line, n_unmeth); + if (res.ec != std::errc()) return false; + } + } + return true; +} diff --git a/src/common/xcounts_utils.hpp b/src/common/xcounts_utils.hpp new file mode 100644 index 0000000..b4a14a4 --- /dev/null +++ b/src/common/xcounts_utils.hpp @@ -0,0 +1,39 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023-2024 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef XCOUNTS_UTILS_HPP +#define XCOUNTS_UTILS_HPP + +#include +#include +#include +#include + +struct xcounts_entry { + uint64_t pos{}; // absolute position + uint32_t n_meth{}; + uint32_t n_unmeth{}; +}; + +std::unordered_map> +read_xcounts_by_chrom(const uint32_t n_threads, const std::string &xcounts_file); + +bool +get_is_xcounts_file(const std::string &filename); + +#endif From f4d89303b6a3193f4bf838f96204f65387bba6f5 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 14:32:52 -0700 Subject: [PATCH 2/3] Makefile.am: Adding xcounts_utils source and header file to dependencies --- Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.am b/Makefile.am index fe5e1e1..a0120a3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -153,6 +153,7 @@ libdnmtools_a_SOURCES = \ src/common/dnmtools_gaussinv.cpp \ src/common/bam_record_utils.cpp \ src/common/counts_header.cpp \ + src/common/xcounts_utils.cpp \ src/common/BetaBin.cpp \ src/common/EmissionDistribution.cpp \ src/common/Epiread.cpp \ @@ -172,6 +173,7 @@ libdnmtools_a_SOURCES += \ src/common/dnmtools_gaussinv.hpp \ src/common/bam_record_utils.hpp \ src/common/counts_header.hpp \ + src/common/xcounts_utils.hpp \ src/common/BetaBin.hpp \ src/common/EmissionDistribution.hpp \ src/common/Epiread.hpp \ From 3f3e770c9c34a2209ee48ea83b55a3b4b7e007b5 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sun, 23 Jun 2024 14:33:23 -0700 Subject: [PATCH 3/3] src/analysis/cpgbins.cpp: updates for refactoring into xcounts_utils --- src/analysis/cpgbins.cpp | 77 +++++++--------------------------------- 1 file changed, 12 insertions(+), 65 deletions(-) diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index fd49f20..8ae1fee 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -20,7 +20,7 @@ #include "MSite.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" -#include "counts_header.hpp" +#include "xcounts_utils.hpp" #include "smithlab_utils.hpp" #include @@ -121,69 +121,16 @@ get_chrom_names(const string &chrom_sizes_file) { return chrom_names; } -struct xsym_entry { - uint64_t pos{}; - uint32_t n_meth{}; - uint32_t n_unmeth{}; -}; -static unordered_map> -read_xsym_by_chrom(const uint32_t n_threads, const string &xsym_file) { - bamxx::bam_tpool tp(n_threads); - bamxx::bgzf_file in(xsym_file, "r"); - if (!in) throw runtime_error("failed to open input file"); - - // set the threads for the input file decompression - if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); - - kstring_t line{0, 0, nullptr}; - string chrom_name; - uint64_t pos = 0; - - unordered_map> sites_by_chrom; - - vector curr_chrom; - - while (bamxx::getline(in, line)) { - if (is_counts_header_line(line.s)) continue; // ADS: early loop exit - - if (!std::isdigit(line.s[0])) { // check if we have a chrom line - if (!chrom_name.empty()) { - sites_by_chrom.insert({chrom_name, curr_chrom}); - curr_chrom.clear(); - } - chrom_name = string{line.s}; - pos = 0; - continue; - } - - uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0; - const auto end_line = line.s + line.l; - auto res = std::from_chars(line.s, end_line, pos_step); - res = std::from_chars(res.ptr + 1, end_line, n_meth); - res = std::from_chars(res.ptr + 1, end_line, n_unmeth); - - const auto curr_pos = pos + pos_step; - - curr_chrom.push_back({curr_pos, n_meth, n_unmeth}); - - pos = curr_pos; - } - - if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom}); - - return sites_by_chrom; -} - -void -update(LevelsCounter &lc, const xsym_entry &xse) { - const uint64_t n_reads = xse.n_meth + xse.n_unmeth; +static void +update(LevelsCounter &lc, const xcounts_entry &xce) { + const uint64_t n_reads = xce.n_meth + xce.n_unmeth; if (n_reads > 0) { ++lc.sites_covered; lc.max_depth = std::max(lc.max_depth, n_reads); - lc.total_c += xse.n_meth; - lc.total_t += xse.n_unmeth; - const auto meth = static_cast(xse.n_unmeth) / n_reads; + lc.total_c += xce.n_meth; + lc.total_t += xce.n_unmeth; + const auto meth = static_cast(xce.n_unmeth) / n_reads; lc.total_meth += meth; double lower = 0.0, upper = 0.0; wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper); @@ -196,7 +143,7 @@ update(LevelsCounter &lc, const xsym_entry &xse) { static void process_chrom(const bool report_more_info, const char level_code, const string &chrom_name, const uint64_t chrom_size, - const uint64_t bin_size, const vector &sites, + const uint64_t bin_size, const vector &sites, ostream &out) { GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); @@ -302,17 +249,17 @@ Columns (beyond the first 6) in the BED format output: return EXIT_FAILURE; } const string chrom_sizes_file = leftover_args.front(); - const string xsym_file = leftover_args.back(); + const string xcounts_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ if (!fs::is_regular_file(chrom_sizes_file)) throw runtime_error("chromosome sizes file not a regular file: " + chrom_sizes_file); - if (!fs::is_regular_file(xsym_file)) - throw runtime_error("xsym file not a regular file: " + xsym_file); + if (!fs::is_regular_file(xcounts_file)) + throw runtime_error("xsym file not a regular file: " + xcounts_file); - const auto sites_by_chrom = read_xsym_by_chrom(n_threads, xsym_file); + const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xcounts_file); const auto chrom_names = get_chrom_names(chrom_sizes_file); const auto chrom_sizes = get_chrom_sizes(chrom_sizes_file);