Skip to content

Commit

Permalink
Merge pull request #200 from smithlabcode/compress-counts
Browse files Browse the repository at this point in the history
compress counts
  • Loading branch information
andrewdavidsmith authored Dec 8, 2023
2 parents d085d4b + 6e705cb commit cf3675a
Show file tree
Hide file tree
Showing 11 changed files with 746 additions and 54 deletions.
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ dnmtools_SOURCES += src/utils/fast-liftover.cpp
dnmtools_SOURCES += src/utils/covered.cpp
dnmtools_SOURCES += src/utils/recovered.cpp
dnmtools_SOURCES += src/utils/kmersites.cpp
dnmtools_SOURCES += src/utils/xcounts.cpp
dnmtools_SOURCES += src/utils/unxcounts.cpp

dnmtools_SOURCES += src/amrfinder/allelicmeth.cpp
dnmtools_SOURCES += src/amrfinder/amrfinder.cpp
Expand Down
57 changes: 38 additions & 19 deletions src/analysis/methcounts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ using std::unordered_set;
using std::unordered_map;

using bamxx::bam_rec;
using bamxx::bam_header;
using bamxx::bgzf_file;

struct quick_buf : public std::ostringstream,
public std::basic_stringbuf<char> {
Expand Down Expand Up @@ -218,7 +220,7 @@ tag_with_mut(const uint32_t tag, const bool mut) {

template <const bool require_covered = false>
static void
write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out,
write_output(const bam_header &hdr, bgzf_file &out,
const int32_t tid, const string &chrom,
const vector<CountSet> &counts, bool CPG_ONLY) {

Expand Down Expand Up @@ -322,7 +324,7 @@ count_states_neg(const bam_rec &aln, vector<CountSet> &counts) {


static unordered_map<int32_t, size_t>
get_tid_to_idx(const bamxx::bam_header &hdr,
get_tid_to_idx(const bam_header &hdr,
const unordered_map<string, size_t> name_to_idx) {
unordered_map<int32_t, size_t> tid_to_idx;
for (int32_t i = 0; i < hdr.h->n_targets; ++i) {
Expand All @@ -342,15 +344,15 @@ template <class CS>
static void
output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid,
const unordered_map<int32_t, size_t> &tid_to_idx,
const bamxx::bam_header &hdr,
const bam_header &hdr,
const vector<string>::const_iterator chroms_beg,
const vector<size_t> &chrom_sizes,
vector<CS> &counts, bamxx::bgzf_file &out) {
vector<CS> &counts, bgzf_file &out) {

// get the index of the next chrom sequence
const auto chrom_idx = tid_to_idx.find(tid);
if (chrom_idx == cend(tid_to_idx))
throw dnmt_error("chrom not found: " + string(sam_hdr_tid2name(hdr, tid)));
throw dnmt_error("chrom not found: " + sam_hdr_tid2name(hdr, tid));

const auto chrom_itr = chroms_beg + chrom_idx->second;

Expand All @@ -363,15 +365,15 @@ output_skipped_chromosome(const bool CPG_ONLY, const int32_t tid,


static bool
consistent_targets(const bamxx::bam_header &hdr,
consistent_targets(const bam_header &hdr,
const unordered_map<int32_t, size_t> &tid_to_idx,
const vector<string> &names, const vector<size_t> &sizes) {
const size_t n_targets = hdr.h->n_targets;
if (n_targets != names.size()) return false;

for (size_t tid = 0; tid < n_targets; ++tid) {
const string tid_name_sam = sam_hdr_tid2name(hdr.h, tid);
const size_t tid_size_sam = sam_hdr_tid2len(hdr.h, tid);
const string tid_name_sam = sam_hdr_tid2name(hdr, tid);
const size_t tid_size_sam = sam_hdr_tid2len(hdr, tid);
const auto idx_itr = tid_to_idx.find(tid);
if (idx_itr == cend(tid_to_idx)) return false;
const auto idx = idx_itr->second;
Expand All @@ -380,13 +382,24 @@ consistent_targets(const bamxx::bam_header &hdr,
return true;
}


template<const bool require_covered = false>
static void
write_header(const bam_header &hdr, bgzf_file &out) {
for (auto i = 0; i < hdr.h->n_targets; ++i) {
const size_t tid_size = sam_hdr_tid2len(hdr, i);
const string tid_name = sam_hdr_tid2name(hdr, i);
std::ostringstream oss;
oss << "# " << tid_name << ' ' << tid_size << '\n';
out.write(oss.str());
}
out.write("# end_header\n");
}

template<const bool require_covered = false> static void
process_reads(const bool VERBOSE, const bool show_progress,
const bool compress_output, const size_t n_threads,
const string &infile, const string &outfile,
const string &chroms_file, const bool CPG_ONLY) {
const bool compress_output, const bool include_header,
const size_t n_threads, const string &infile,
const string &outfile, const string &chroms_file,
const bool CPG_ONLY) {
// first get the chromosome names and sequences from the FASTA file
vector<string> chroms, names;
read_fasta_file_short_names(chroms_file, names, chroms);
Expand All @@ -410,7 +423,7 @@ process_reads(const bool VERBOSE, const bool show_progress,
bamxx::bam_in hts(infile);
if (!hts) throw dnmt_error("failed to open input file");
// load the input file's header
bamxx::bam_header hdr(hts);
bam_header hdr(hts);
if (!hdr) throw dnmt_error("failed to read header");

unordered_map<int32_t, size_t> tid_to_idx = get_tid_to_idx(hdr, name_to_idx);
Expand All @@ -420,7 +433,7 @@ process_reads(const bool VERBOSE, const bool show_progress,

// open the output file
const string output_mode = compress_output ? "w" : "wu";
bamxx::bgzf_file out(outfile, output_mode);
bgzf_file out(outfile, output_mode);
if (!out) throw dnmt_error("error opening output file: " + outfile);

// set the threads for the input file decompression
Expand All @@ -429,6 +442,9 @@ process_reads(const bool VERBOSE, const bool show_progress,
tp.set_io(out);
}

if (include_header)
write_header(hdr, out);

// now iterate over the reads, switching chromosomes and writing
// output as needed
bam_rec aln;
Expand Down Expand Up @@ -499,6 +515,7 @@ main_counts(int argc, const char **argv) {
bool CPG_ONLY = false;
bool require_covered = false;
bool compress_output = false;
bool include_header = false;

string chroms_file;
string outfile;
Expand All @@ -519,6 +536,8 @@ main_counts(int argc, const char **argv) {
false, CPG_ONLY);
opt_parse.add_opt("require-covered", 'r', "only output covered sites",
false, require_covered);
opt_parse.add_opt("header", 'H', "add a header to identify the reference",
false, include_header);
opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output);
opt_parse.add_opt("progress", '\0', "show progress", false, show_progress);
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
Expand Down Expand Up @@ -559,12 +578,12 @@ main_counts(int argc, const char **argv) {

if (require_covered)
process_reads<true>(VERBOSE, show_progress, compress_output,
n_threads, mapped_reads_file, outfile,
include_header, n_threads, mapped_reads_file, outfile,
chroms_file, CPG_ONLY);
else
process_reads(VERBOSE, show_progress, compress_output,
n_threads, mapped_reads_file, outfile,
chroms_file, CPG_ONLY);
process_reads(VERBOSE, show_progress, compress_output, include_header,
n_threads, mapped_reads_file, outfile, chroms_file,
CPG_ONLY);
}
catch (const std::exception &e) {
cerr << e.what() << endl;
Expand Down
26 changes: 18 additions & 8 deletions src/common/MSite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,14 @@ using std::cbegin;
using std::cend;
using std::end;

MSite::MSite(const string &line) {

bool
MSite::initialize(const char *c, const char *c_end) {
constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };

bool failed = false;

const auto c = line.data();
const auto c_end = c + line.size();

auto field_s = c;
auto field_e = find_if(field_s + 1, c_end, is_sep);
if (field_e == c_end) failed = true;
Expand Down Expand Up @@ -97,14 +96,25 @@ MSite::MSite(const string &line) {
const auto [ptr, ec] = from_chars(field_s, c_end, n_reads);
failed = failed || (ptr != c_end);
}
// ADS: the value below would work for a flag
// pos = std::numeric_limits<decltype(pos)>::max();

return !failed;
}


if (failed) {
MSite::MSite(const string &line) {
if (!initialize(line.data(), line.data() + size(line)))
throw runtime_error("bad count line: " + line);
// ADS: the value below would work for a flag
// pos = std::numeric_limits<decltype(pos)>::max();
}
}


MSite::MSite(const char *line, const int n) {
if (!initialize(line, line + n))
throw runtime_error("bad count line: " + string(line));
}


string
MSite::tostring() const {
std::ostringstream oss;
Expand Down
3 changes: 3 additions & 0 deletions src/common/MSite.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ struct MSite {
chrom(_chrom), pos(_pos), strand(_strand),
context(_context), meth(_meth), n_reads(_n_reads) {}
explicit MSite(const std::string &line);
explicit MSite(const char *line, const int n);

bool initialize(const char *c, const char *c_end);

std::string chrom;
size_t pos;
Expand Down
5 changes: 5 additions & 0 deletions src/common/bam_record_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ sam_hdr_tid2name(const bamxx::bam_header &hdr, const int32_t tid) {
return std::string(sam_hdr_tid2name(hdr.h, tid));
}

inline uint32_t
sam_hdr_tid2len(const bamxx::bam_header &hdr, const int32_t tid) {
return sam_hdr_tid2len(hdr.h, tid);
}

inline std::string
sam_hdr_tid2name(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) {
return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid));
Expand Down
6 changes: 6 additions & 0 deletions src/dnmtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ metagene(int argc, const char **argv);
int
main_covered(int argc, const char **argv);
int
main_xcounts(int argc, const char **argv);
int
main_unxcounts(int argc, const char **argv);
int
main_recovered(int argc, const char **argv);
int
kmersites(int argc, const char **argv);
Expand Down Expand Up @@ -200,6 +204,8 @@ main(int argc, const char **argv) {
{"merge", "merge multiple counts files into a counts file or a table", main_merge_methcounts},
{"covered", "filter a counts file for only covered sites", main_covered},
{"recovered", "replace missing sites in a counts file", main_recovered},
{"xcounts", "compress counts files by removing information", main_xcounts},
{"unxcounts", "reverse the xcounts process yielding a counts file", main_unxcounts},
{"selectsites", "sites inside a set of genomic intervals", main_selectsites},
{"kmersites", "make track file for sites matching kmer", kmersites}}}}}};
// clang-format on
Expand Down
2 changes: 1 addition & 1 deletion src/utils/covered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ main_covered(int argc, const char **argv) {

/****************** COMMAND LINE OPTIONS ********************/
OptionParser opt_parse(strip_path(argv[0]), description,
"<methcounts-file> (\"-\" for standard input)", 1);
"<counts-file> (\"-\" for standard input)", 1);
opt_parse.add_opt("output", 'o', "output file (default is standard out)",
false, outfile);
opt_parse.add_opt("threads", 't', "threads for compression (use few)",
Expand Down
6 changes: 5 additions & 1 deletion src/utils/recovered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads,
int32_t prev_idx = -1;

while (getline(in, line)) {
if (line[0] == '#') continue;
line.resize(line.find_first_of(" \t"));
if (line != prev_chrom) {
if (verbose) cerr << "verifying: " << line << endl;
Expand Down Expand Up @@ -273,8 +274,11 @@ process_sites(const bool verbose, const bool add_missing_chroms,
// ADS: this is probably a poor strategy since we already would know
// the index of the chrom sequence in the vector.
chrom_itr_t chrom_itr;
string line;

while (read_site(in, site)) {
while (getline(in, line)) {
if (line[0] == '#') continue;
site.initialize(line.data(), line.data() + size(line));
if (site.chrom != chrom_name) {

if (pos != num_lim<uint64_t>::max())
Expand Down
Loading

0 comments on commit cf3675a

Please sign in to comment.