Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding xcounts_utils.cpp source file #234

Merged
merged 3 commits into from
Jun 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ libdnmtools_a_SOURCES = \
src/common/dnmtools_gaussinv.cpp \
src/common/bam_record_utils.cpp \
src/common/counts_header.cpp \
src/common/xcounts_utils.cpp \
src/common/BetaBin.cpp \
src/common/EmissionDistribution.cpp \
src/common/Epiread.cpp \
Expand All @@ -172,6 +173,7 @@ libdnmtools_a_SOURCES += \
src/common/dnmtools_gaussinv.hpp \
src/common/bam_record_utils.hpp \
src/common/counts_header.hpp \
src/common/xcounts_utils.hpp \
src/common/BetaBin.hpp \
src/common/EmissionDistribution.hpp \
src/common/Epiread.hpp \
Expand Down
77 changes: 12 additions & 65 deletions src/analysis/cpgbins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "MSite.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "counts_header.hpp"
#include "xcounts_utils.hpp"
#include "smithlab_utils.hpp"

#include <bamxx.hpp>
Expand Down Expand Up @@ -121,69 +121,16 @@ get_chrom_names(const string &chrom_sizes_file) {
return chrom_names;
}

struct xsym_entry {
uint64_t pos{};
uint32_t n_meth{};
uint32_t n_unmeth{};
};

static unordered_map<string, vector<xsym_entry>>
read_xsym_by_chrom(const uint32_t n_threads, const string &xsym_file) {
bamxx::bam_tpool tp(n_threads);
bamxx::bgzf_file in(xsym_file, "r");
if (!in) throw runtime_error("failed to open input file");

// set the threads for the input file decompression
if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);

kstring_t line{0, 0, nullptr};
string chrom_name;
uint64_t pos = 0;

unordered_map<string, vector<xsym_entry>> sites_by_chrom;

vector<xsym_entry> curr_chrom;

while (bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) continue; // ADS: early loop exit

if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!chrom_name.empty()) {
sites_by_chrom.insert({chrom_name, curr_chrom});
curr_chrom.clear();
}
chrom_name = string{line.s};
pos = 0;
continue;
}

uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
res = std::from_chars(res.ptr + 1, end_line, n_meth);
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);

const auto curr_pos = pos + pos_step;

curr_chrom.push_back({curr_pos, n_meth, n_unmeth});

pos = curr_pos;
}

if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom});

return sites_by_chrom;
}

void
update(LevelsCounter &lc, const xsym_entry &xse) {
const uint64_t n_reads = xse.n_meth + xse.n_unmeth;
static void
update(LevelsCounter &lc, const xcounts_entry &xce) {
const uint64_t n_reads = xce.n_meth + xce.n_unmeth;
if (n_reads > 0) {
++lc.sites_covered;
lc.max_depth = std::max(lc.max_depth, n_reads);
lc.total_c += xse.n_meth;
lc.total_t += xse.n_unmeth;
const auto meth = static_cast<double>(xse.n_unmeth) / n_reads;
lc.total_c += xce.n_meth;
lc.total_t += xce.n_unmeth;
const auto meth = static_cast<double>(xce.n_unmeth) / n_reads;
lc.total_meth += meth;
double lower = 0.0, upper = 0.0;
wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper);
Expand All @@ -196,7 +143,7 @@ update(LevelsCounter &lc, const xsym_entry &xse) {
static void
process_chrom(const bool report_more_info, const char level_code,
const string &chrom_name, const uint64_t chrom_size,
const uint64_t bin_size, const vector<xsym_entry> &sites,
const uint64_t bin_size, const vector<xcounts_entry> &sites,
ostream &out) {
GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+');

Expand Down Expand Up @@ -302,17 +249,17 @@ Columns (beyond the first 6) in the BED format output:
return EXIT_FAILURE;
}
const string chrom_sizes_file = leftover_args.front();
const string xsym_file = leftover_args.back();
const string xcounts_file = leftover_args.back();
/****************** END COMMAND LINE OPTIONS *****************/

if (!fs::is_regular_file(chrom_sizes_file))
throw runtime_error("chromosome sizes file not a regular file: " +
chrom_sizes_file);

if (!fs::is_regular_file(xsym_file))
throw runtime_error("xsym file not a regular file: " + xsym_file);
if (!fs::is_regular_file(xcounts_file))
throw runtime_error("xsym file not a regular file: " + xcounts_file);

const auto sites_by_chrom = read_xsym_by_chrom(n_threads, xsym_file);
const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xcounts_file);
const auto chrom_names = get_chrom_names(chrom_sizes_file);
const auto chrom_sizes = get_chrom_sizes(chrom_sizes_file);

Expand Down
130 changes: 130 additions & 0 deletions src/common/xcounts_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/* xcounts_utils: code for doing things with xcounts format and some
* for counts format that is common to several tools.
*
* Copyright (C) 2023-2024 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#include "xcounts_utils.hpp"

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cassert>
#include <cstdint>
#include <sstream>
#include <charconv>
#include <unordered_map>

#include "counts_header.hpp"
#include "bam_record_utils.hpp"

#include "bamxx.hpp"
#include "dnmt_error.hpp"

using std::vector;
using std::string;
using std::to_string;
using std::unordered_map;
using std::runtime_error;

using bamxx::bgzf_file;


// careful: this could get big
unordered_map<string, vector<xcounts_entry>>
read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) {
bamxx::bam_tpool tp(n_threads);
bamxx::bgzf_file in(xcounts_file, "r");
if (!in) throw runtime_error("failed to open input file");

// set the threads for the input file decompression
if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);

kstring_t line{0, 0, nullptr};
string chrom_name;
uint64_t pos = 0;

unordered_map<string, vector<xcounts_entry>> sites_by_chrom;

vector<xcounts_entry> curr_chrom;

while (bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) continue; // ADS: early loop exit

if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!chrom_name.empty()) {
sites_by_chrom.insert({chrom_name, curr_chrom});
curr_chrom.clear();
}
chrom_name = string{line.s};
pos = 0;
continue;
}

uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
res = std::from_chars(res.ptr + 1, end_line, n_meth);
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);

const auto curr_pos = pos + pos_step;

curr_chrom.push_back({curr_pos, n_meth, n_unmeth});

pos = curr_pos;
}

if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom});

return sites_by_chrom;
}


bool
get_is_xcounts_file(const std::string &filename) {
static constexpr auto max_lines_to_check = 1000ul;
bamxx::bgzf_file in(filename, "r");
if (!in) throw dnmt_error{"failed to open input file: " + filename};

kstring_t line{0, 0, nullptr};

bool found_header = false;
bool found_chrom = false;

auto line_count = 0ul;
while (line_count++ < max_lines_to_check && bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) {
found_header = true;
}
else if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!found_header)
return false;
found_chrom = true;
}
else {
if (!found_chrom) return false;
int64_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
if (res.ec != std::errc()) return false;
res = std::from_chars(res.ptr + 1, end_line, n_meth);
if (res.ec != std::errc()) return false;
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);
if (res.ec != std::errc()) return false;
}
}
return true;
}
39 changes: 39 additions & 0 deletions src/common/xcounts_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/* xcounts_utils: code for doing things with xcounts format and some
* for counts format that is common to several tools.
*
* Copyright (C) 2023-2024 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#ifndef XCOUNTS_UTILS_HPP
#define XCOUNTS_UTILS_HPP

#include <string>
#include <vector>
#include <cstdint>
#include <unordered_map>

struct xcounts_entry {
uint64_t pos{}; // absolute position
uint32_t n_meth{};
uint32_t n_unmeth{};
};

std::unordered_map<std::string, std::vector<xcounts_entry>>
read_xcounts_by_chrom(const uint32_t n_threads, const std::string &xcounts_file);

bool
get_is_xcounts_file(const std::string &filename);

#endif
Loading