Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compress counts #200

Merged
merged 13 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ dnmtools_SOURCES += src/utils/fast-liftover.cpp
dnmtools_SOURCES += src/utils/covered.cpp
dnmtools_SOURCES += src/utils/recovered.cpp
dnmtools_SOURCES += src/utils/kmersites.cpp
dnmtools_SOURCES += src/utils/xcounts.cpp
dnmtools_SOURCES += src/utils/unxcounts.cpp

dnmtools_SOURCES += src/amrfinder/allelicmeth.cpp
dnmtools_SOURCES += src/amrfinder/amrfinder.cpp
Expand Down
57 changes: 38 additions & 19 deletions src/analysis/methcounts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ using std::unordered_set;
using std::unordered_map;

using bamxx::bam_rec;
using bamxx::bam_header;
using bamxx::bgzf_file;

struct quick_buf : public std::ostringstream,
public std::basic_stringbuf<char> {
Expand Down Expand Up @@ -218,7 +220,7 @@ tag_with_mut(const uint32_t tag, const bool mut) {

template <const bool require_covered = false>
static void
write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out,
write_output(const bam_header &hdr, bgzf_file &out,
const int32_t tid, const string &chrom,
const vector<CountSet> &counts, bool CPG_ONLY) {

Expand Down Expand Up @@ -322,7 +324,7 @@ count_states_neg(const bam_rec &aln, vector<CountSet> &counts) {


static unordered_map<int32_t, size_t>
get_tid_to_idx(const bamxx::bam_header &hdr,
get_tid_to_idx(const bam_header &hdr,
const unordered_map<string, size_t> name_to_idx) {
unordered_map<int32_t, size_t> tid_to_idx;
for (int32_t i = 0; i < hdr.h->n_targets; ++i) {
Expand All @@ -342,15 +344,15 @@ template <class CS>
static void
output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid,
const unordered_map<int32_t, size_t> &tid_to_idx,
const bamxx::bam_header &hdr,
const bam_header &hdr,
const vector<string>::const_iterator chroms_beg,
const vector<size_t> &chrom_sizes,
vector<CS> &counts, bamxx::bgzf_file &out) {
vector<CS> &counts, bgzf_file &out) {

// get the index of the next chrom sequence
const auto chrom_idx = tid_to_idx.find(tid);
if (chrom_idx == cend(tid_to_idx))
throw dnmt_error("chrom not found: " + string(sam_hdr_tid2name(hdr, tid)));
throw dnmt_error("chrom not found: " + sam_hdr_tid2name(hdr, tid));

const auto chrom_itr = chroms_beg + chrom_idx->second;

Expand All @@ -363,15 +365,15 @@ output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid,


static bool
consistent_targets(const bamxx::bam_header &hdr,
consistent_targets(const bam_header &hdr,
const unordered_map<int32_t, size_t> &tid_to_idx,
const vector<string> &names, const vector<size_t> &sizes) {
const size_t n_targets = hdr.h->n_targets;
if (n_targets != names.size()) return false;

for (size_t tid = 0; tid < n_targets; ++tid) {
const string tid_name_sam = sam_hdr_tid2name(hdr.h, tid);
const size_t tid_size_sam = sam_hdr_tid2len(hdr.h, tid);
const string tid_name_sam = sam_hdr_tid2name(hdr, tid);
const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid);
const auto idx_itr = tid_to_idx.find(tid);
if (idx_itr == cend(tid_to_idx)) return false;
const auto idx = idx_itr->second;
Expand All @@ -380,13 +382,24 @@ consistent_targets(const bamxx::bam_header &hdr,
return true;
}


template<const bool require_covered = false>
static void
write_header(const bam_header &hdr, bgzf_file &out) {
for (auto i = 0; i < hdr.h->n_targets; ++i) {
const size_t tid_size = sam_hdr_tid2len(hdr, i);
const string tid_name = sam_hdr_tid2name(hdr, i);
std::ostringstream oss;
oss << "# " << tid_name << ' ' << tid_size << '\n';
out.write(oss.str());
}
out.write("# end_header\n");
}

template<const bool require_covered = false> static void
process_reads(const bool VERBOSE, const bool show_progress,
const bool compress_output, const size_t n_threads,
const string &infile, const string &outfile,
const string &chroms_file, const bool CPG_ONLY) {
const bool compress_output, const bool include_header,
const size_t n_threads, const string &infile,
const string &outfile, const string &chroms_file,
const bool CPG_ONLY) {
// first get the chromosome names and sequences from the FASTA file
vector<string> chroms, names;
read_fasta_file_short_names(chroms_file, names, chroms);
Expand All @@ -410,7 +423,7 @@ process_reads(const bool VERBOSE, const bool show_progress,
bamxx::bam_in hts(infile);
if (!hts) throw dnmt_error("failed to open input file");
// load the input file's header
bamxx::bam_header hdr(hts);
bam_header hdr(hts);
if (!hdr) throw dnmt_error("failed to read header");

unordered_map<int32_t, size_t> tid_to_idx = get_tid_to_idx(hdr, name_to_idx);
Expand All @@ -420,7 +433,7 @@ process_reads(const bool VERBOSE, const bool show_progress,

// open the output file
const string output_mode = compress_output ? "w" : "wu";
bamxx::bgzf_file out(outfile, output_mode);
bgzf_file out(outfile, output_mode);
if (!out) throw dnmt_error("error opening output file: " + outfile);

// set the threads for the input file decompression
Expand All @@ -429,6 +442,9 @@ process_reads(const bool VERBOSE, const bool show_progress,
tp.set_io(out);
}

if (include_header)
write_header(hdr, out);

// now iterate over the reads, switching chromosomes and writing
// output as needed
bam_rec aln;
Expand Down Expand Up @@ -499,6 +515,7 @@ main_counts(int argc, const char **argv) {
bool CPG_ONLY = false;
bool require_covered = false;
bool compress_output = false;
bool include_header = false;

string chroms_file;
string outfile;
Expand All @@ -519,6 +536,8 @@ main_counts(int argc, const char **argv) {
false, CPG_ONLY);
opt_parse.add_opt("require-covered", 'r', "only output covered sites",
false, require_covered);
opt_parse.add_opt("header", 'H', "add a header to identify the reference",
false, include_header);
opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output);
opt_parse.add_opt("progress", '\0', "show progress", false, show_progress);
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
Expand Down Expand Up @@ -559,12 +578,12 @@ main_counts(int argc, const char **argv) {

if (require_covered)
process_reads<true>(VERBOSE, show_progress, compress_output,
n_threads, mapped_reads_file, outfile,
include_header, n_threads, mapped_reads_file, outfile,
chroms_file, CPG_ONLY);
else
process_reads(VERBOSE, show_progress, compress_output,
n_threads, mapped_reads_file, outfile,
chroms_file, CPG_ONLY);
process_reads(VERBOSE, show_progress, compress_output, include_header,
n_threads, mapped_reads_file, outfile, chroms_file,
CPG_ONLY);
}
catch (const std::exception &e) {
cerr << e.what() << endl;
Expand Down
26 changes: 18 additions & 8 deletions src/common/MSite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,14 @@ using std::cbegin;
using std::cend;
using std::end;

MSite::MSite(const string &line) {

bool
MSite::initialize(const char *c, const char *c_end) {
constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };

bool failed = false;

const auto c = line.data();
const auto c_end = c + line.size();

auto field_s = c;
auto field_e = find_if(field_s + 1, c_end, is_sep);
if (field_e == c_end) failed = true;
Expand Down Expand Up @@ -97,14 +96,25 @@ MSite::MSite(const string &line) {
const auto [ptr, ec] = from_chars(field_s, c_end, n_reads);
failed = failed || (ptr != c_end);
}
// ADS: the value below would work for a flag
// pos = std::numeric_limits<decltype(pos)>::max();

return !failed;
}


if (failed) {
MSite::MSite(const string &line) {
if (!initialize(line.data(), line.data() + size(line)))
throw runtime_error("bad count line: " + line);
// ADS: the value below would work for a flag
// pos = std::numeric_limits<decltype(pos)>::max();
}
}


MSite::MSite(const char *line, const int n) {
if (!initialize(line, line + n))
throw runtime_error("bad count line: " + string(line));
}


string
MSite::tostring() const {
std::ostringstream oss;
Expand Down
3 changes: 3 additions & 0 deletions src/common/MSite.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ struct MSite {
chrom(_chrom), pos(_pos), strand(_strand),
context(_context), meth(_meth), n_reads(_n_reads) {}
explicit MSite(const std::string &line);
explicit MSite(const char *line, const int n);

bool initialize(const char *c, const char *c_end);

std::string chrom;
size_t pos;
Expand Down
5 changes: 5 additions & 0 deletions src/common/bam_record_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ sam_hdr_tid2name(const bamxx::bam_header &hdr, const int32_t tid) {
return std::string(sam_hdr_tid2name(hdr.h, tid));
}

inline uint32_t
sam_hdr_tid2len(const bamxx::bam_header &hdr, const int32_t tid) {
return sam_hdr_tid2len(hdr.h, tid);
}

inline std::string
sam_hdr_tid2name(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) {
return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid));
Expand Down
6 changes: 6 additions & 0 deletions src/dnmtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ metagene(int argc, const char **argv);
int
main_covered(int argc, const char **argv);
int
main_xcounts(int argc, const char **argv);
int
main_unxcounts(int argc, const char **argv);
int
main_recovered(int argc, const char **argv);
int
kmersites(int argc, const char **argv);
Expand Down Expand Up @@ -200,6 +204,8 @@ main(int argc, const char **argv) {
{"merge", "merge multiple counts files into a counts file or a table", main_merge_methcounts},
{"covered", "filter a counts file for only covered sites", main_covered},
{"recovered", "replace missing sites in a counts file", main_recovered},
{"xcounts", "compress counts files by removing information", main_xcounts},
{"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts},
{"selectsites", "sites inside a set of genomic intervals", main_selectsites},
{"kmersites", "make track file for sites matching kmer", kmersites}}}}}};
// clang-format on
Expand Down
2 changes: 1 addition & 1 deletion src/utils/covered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ main_covered(int argc, const char **argv) {

/****************** COMMAND LINE OPTIONS ********************/
OptionParser opt_parse(strip_path(argv[0]), description,
"<methcounts-file> (\"-\" for standard input)", 1);
"<counts-file> (\"-\" for standard input)", 1);
opt_parse.add_opt("output", 'o', "output file (default is standard out)",
false, outfile);
opt_parse.add_opt("threads", 't', "threads for compression (use few)",
Expand Down
6 changes: 5 additions & 1 deletion src/utils/recovered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
int32_t prev_idx = -1;

while (getline(in, line)) {
if (line[0] == '#') continue;
line.resize(line.find_first_of(" \t"));
if (line != prev_chrom) {
if (verbose) cerr << "verifying: " << line << endl;
Expand Down Expand Up @@ -273,8 +274,11 @@ process_sites(const bool verbose, const bool add_missing_chroms,
// ADS: this is probably a poor strategy since we already would know
// the index of the chrom sequence in the vector.
chrom_itr_t chrom_itr;
string line;

while (read_site(in, site)) {
while (getline(in, line)) {
if (line[0] == '#') continue;
site.initialize(line.data(), line.data() + size(line));
if (site.chrom != chrom_name) {

if (pos != num_lim<uint64_t>::max())
Expand Down
Loading