diff --git a/Makefile.am b/Makefile.am index 368a8d54..e30d299d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -145,6 +145,7 @@ noinst_LIBRARIES = libdnmtools.a libdnmtools_a_SOURCES = \ src/common/dnmtools_gaussinv.cpp \ src/common/bam_record_utils.cpp \ + src/common/counts_header.cpp \ src/common/BetaBin.cpp \ src/common/EmissionDistribution.cpp \ src/common/Epiread.cpp \ @@ -163,6 +164,7 @@ libdnmtools_a_SOURCES += \ src/bamxx/bamxx.hpp \ src/common/dnmtools_gaussinv.hpp \ src/common/bam_record_utils.hpp \ + src/common/counts_header.hpp \ src/common/BetaBin.hpp \ src/common/EmissionDistribution.hpp \ src/common/Epiread.hpp \ diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index 4ecb4812..11cb476a 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -472,10 +472,10 @@ main_amrfinder(int argc, const char **argv) { const EpireadStats epistat{low_prob, high_prob, critical_value, max_itr, use_bic, correct_for_read_count}; - // bamxx::bam_tpool tp(n_threads); + bamxx::bam_tpool tp(n_threads); bgzf_file in(reads_file, "r"); if (!in) throw runtime_error("failed to open input file: " + reads_file); - // if (n_threads > 1) tp.set_io(in); + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); vector amrs; diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp index 4d06ffb4..882caed9 100644 --- a/src/analysis/hmr.cpp +++ b/src/analysis/hmr.cpp @@ -33,6 +33,7 @@ #include "TwoStateHMM.hpp" #include "MSite.hpp" +#include "counts_header.hpp" using std::string; using std::vector; @@ -337,6 +338,9 @@ load_cpgs(const string &cpgs_file, vector &cpgs, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("failed opening file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + MSite prev_site, the_site; while (read_site(in, the_site)) { if (!the_site.is_cpg() || distance(prev_site, the_site) < 2) diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index 76aeaefd..4eab69e6 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -36,6 +36,7 @@ #include "bsutils.hpp" #include "dnmt_error.hpp" #include "bam_record_utils.hpp" +#include "counts_header.hpp" /* HTSlib */ #include @@ -382,17 +383,6 @@ consistent_targets(const bam_header &hdr, return true; } -static void -write_header(const bam_header &hdr, bgzf_file &out) { - for (auto i = 0; i < hdr.h->n_targets; ++i) { - const size_t tid_size = sam_hdr_tid2len(hdr, i); - const string tid_name = sam_hdr_tid2name(hdr, i); - std::ostringstream oss; - oss << "# " << tid_name << ' ' << tid_size << '\n'; - out.write(oss.str()); - } - out.write("# end_header\n"); -} template static void process_reads(const bool VERBOSE, const bool show_progress, @@ -443,7 +433,7 @@ process_reads(const bool VERBOSE, const bool show_progress, } if (include_header) - write_header(hdr, out); + write_counts_header_from_bam_header(hdr, out); // now iterate over the reads, switching chromosomes and writing // output as needed diff --git a/src/analysis/methstates.cpp b/src/analysis/methstates.cpp index 9133bc75..18e28572 100644 --- a/src/analysis/methstates.cpp +++ b/src/analysis/methstates.cpp @@ -254,14 +254,17 @@ main_methstates(int argc, const char **argv) { bamxx::bam_header hdr(in); if (!hdr) throw dnmt_error("cannot read heade" + mapped_reads_file); - /* set the threads for the input file decompression */ - if (n_threads > 1) tp.set_io(in); - // open the output file const string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); if (!out) throw dnmt_error("error opening output file: " + outfile); + /* set the threads for the input file decompression */ + if (n_threads > 1) { + tp.set_io(in); + tp.set_io(out); + } + vector cpgs; unordered_set chroms_seen; diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 92bee14b..07c3e3b1 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -33,6 +33,7 @@ #include "GenomicRegion.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" +#include "counts_header.hpp" #include "TwoStateHMM_PMD.hpp" #include "MSite.hpp" @@ -240,8 +241,11 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, static const double array_coverage_constant = 10; vector in(cpgs_file.size()); - for (size_t i = 0; i < cpgs_file.size(); ++i) + for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); + if (has_counts_header(cpgs_file[i])) + skip_counts_header(*in[i]); + } std::map > pos_meth_tot; size_t n_meth = 0ul, n_reads = 0ul; @@ -344,8 +348,11 @@ find_exact_boundaries(const vector &cpgs_file, static const double array_coverage_constant = 10; vector in(cpgs_file.size()); - for (size_t i = 0; i < cpgs_file.size(); ++i) + for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); + if (has_counts_header(cpgs_file[i])) + skip_counts_header(*in[i]); + } std::map > pos_meth_tot; size_t n_meth = 0ul, n_reads = 0ul; @@ -653,6 +660,7 @@ shuffle_bins(const size_t rng_seed, sort(begin(domain_scores), end(domain_scores)); } + static void assign_p_values(const vector &random_scores, const vector &observed_scores, @@ -665,6 +673,7 @@ assign_p_values(const vector &random_scores, } } + static void read_params_file(const bool VERBOSE, const string ¶ms_file, @@ -759,6 +768,9 @@ check_if_array_data(const string &infile) { bgzf_file in(infile, "r"); if (!in) throw std::runtime_error("bad file: " + infile); + if (has_counts_header(infile)) + skip_counts_header(in); + std::string line; getline(in, line); std::istringstream iss(line); @@ -779,6 +791,10 @@ load_array_data(const size_t bin_size, bgzf_file in(cpgs_file, "r"); if (!in) throw std::runtime_error("bad sites file: " + cpgs_file); + + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + string curr_chrom; size_t prev_pos = 0ul, curr_pos = 0ul; double array_meth_bin = 0.0; @@ -872,6 +888,9 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("bad sites file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + // keep track of the chroms we've seen string curr_chrom; std::unordered_set chroms_seen; @@ -947,6 +966,9 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, bgzf_file in(cpgs_file, "r"); if (!in) throw runtime_error("bad methcounts file: " + cpgs_file); + if (has_counts_header(cpgs_file)) + skip_counts_header(in); + // keep track of where we are and what we've seen size_t bin_start = 0ul; string curr_chrom; @@ -970,6 +992,7 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, } } + static double good_bins_frac(const vector &cumulative, const size_t min_bin_size, const size_t bin_size, const size_t min_cov_to_pass) { diff --git a/src/bamxx b/src/bamxx index 09cfe360..7c2228c2 160000 --- a/src/bamxx +++ b/src/bamxx @@ -1 +1 @@ -Subproject commit 09cfe360be875271014b2824fecd8b732e640630 +Subproject commit 7c2228c2f0233e9df1f1534f374e08c0a196ae55 diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index 36e47fcf..27bd1a22 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -24,6 +24,7 @@ #include #include "smithlab_utils.hpp" +#include "counts_header.hpp" using std::string; using std::runtime_error; @@ -361,7 +362,8 @@ is_msite_file(const string &file) { if (!in) throw runtime_error("cannot open file: " + file); string line; - if (!getline(in, line)) return false; + while (getline(in, line) && is_counts_header_line(line)) + ; return is_msite_line(line); } diff --git a/src/common/counts_header.cpp b/src/common/counts_header.cpp new file mode 100644 index 00000000..e6a814b0 --- /dev/null +++ b/src/common/counts_header.cpp @@ -0,0 +1,163 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "counts_header.hpp" + +#include +#include +#include +#include +#include + +#include "bam_record_utils.hpp" + +// generated by autotools +#include + +#include "bamxx.hpp" + +using std::vector; +using std::string; +using std::to_string; + +using bamxx::bgzf_file; + +void +write_counts_header_from_chom_sizes(const vector &chrom_names, + const vector &chrom_sizes, + bgzf_file &out) { + const auto version = "#DNMTOOLS " + string(VERSION) + "\n"; + out.write(version.c_str()); + for (auto i = 0u; i < size(chrom_sizes); ++i) { + const string tmp = + "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; + out.write(tmp.c_str()); + } + out.write("#\n"); +} + + +inline bgzf_file & +getline(bgzf_file &file, kstring_t &line) { + if (file.f == nullptr) return file; + const int x = bgzf_getline(file.f, '\n', &line); + if (x == -1) { + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + } + if (x < -1) { + // ADS: this is an error condition and should be handled + // differently from the EOF above. + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + } + return file; +} + + +bamxx::bgzf_file & +skip_counts_header(bamxx::bgzf_file &in) { + + // use the kstring_t type to more directly use the BGZF file + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) return in; + + while (getline(in, line) && line.s[0] == '#') { + if (line.s[0] == '#' && line.l == 1) + return in; + } + // otherwise we have missed the final line of the header + assert(line.s[0] != '#'); + return in; +} + + +int +get_chrom_sizes_for_counts_header(const uint32_t n_threads, + const string &filename, + vector &chrom_names, + vector &chrom_sizes) { + + bamxx::bam_tpool tpool(n_threads); + + bgzf_file in(filename, "r"); + if (!in) return -1; + if (n_threads > 1 && in.is_bgzf()) + tpool.set_io(in); + + chrom_names.clear(); + chrom_sizes.clear(); + + // use the kstring_t type to more directly use the BGZF file + kstring_t line{0, 0, nullptr}; + const int ret = ks_resize(&line, 1024); + if (ret) return -1; + + uint64_t chrom_size = 0; + while (getline(in, line)) { + if (line.s[0] == '>') { + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + chrom_names.emplace_back(line.s + 1); + chrom_size = 0; + } + else chrom_size += line.l; + } + if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + + ks_free(&line); + + assert(size(chrom_names) == size(chrom_sizes)); + + return 0; +} + + +void +write_counts_header_from_bam_header(const bamxx::bam_header &hdr, + bgzf_file &out) { + const auto version = "#DNMTOOLS " + string(VERSION) + "\n"; + out.write(version.c_str()); + for (auto i = 0; i < hdr.h->n_targets; ++i) { + const size_t tid_size = sam_hdr_tid2len(hdr, i); + const string tid_name = sam_hdr_tid2name(hdr, i); + std::ostringstream oss; + oss << "#" << tid_name << ' ' << tid_size << '\n'; + out.write(oss.str()); + } + out.write("#\n"); +} + + +bool +write_counts_header_line(string line, bgzf_file &out) { + line += '\n'; + const int64_t ret = bgzf_write(out.f, line.data(), size(line)); + return ret == static_cast(size(line)); +} + +bool +has_counts_header(const string &filename) { + bgzf_file in(filename, "r"); + if (!in) return false; + string line; + if (!getline(in, line)) return false; + return line[0] == '#'; +} diff --git a/src/common/counts_header.hpp b/src/common/counts_header.hpp new file mode 100644 index 00000000..171eff28 --- /dev/null +++ b/src/common/counts_header.hpp @@ -0,0 +1,65 @@ +/* xcounts_utils: code for doing things with xcounts format and some + * for counts format that is common to several tools. + * + * Copyright (C) 2023 Andrew D. Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef COUNTS_HEADER_HPP +#define COUNTS_HEADER_HPP + +#include +#include +#include + +#include "bamxx.hpp" + +void +write_counts_header_from_chom_sizes(const std::vector &chrom_names, + const std::vector &chrom_sizes, + bamxx::bgzf_file &out); + +// returns -1 on failure, 0 on success +int +get_chrom_sizes_for_counts_header(const uint32_t n_threads, + const std::string &filename, + std::vector &chrom_names, + std::vector &chrom_sizes); + +void +write_counts_header_from_bam_header(const bamxx::bam_header &hdr, + bamxx::bgzf_file &out); + +bool +write_counts_header_line(std::string line, bamxx::bgzf_file &out); + +bamxx::bgzf_file & +skip_counts_header(bamxx::bgzf_file &in); + +bool +has_counts_header(const std::string &filename); + +inline bool +is_counts_header_version_line(const std::string &line) { + const auto version_line = "#DNMTOOLS"; + return line.compare(0, 9, version_line) == 0; +} + +template +inline bool +is_counts_header_line(T &line) { + return line[0] == '#'; +} + +#endif diff --git a/src/utils/covered.cpp b/src/utils/covered.cpp index b96aca56..26efbc75 100644 --- a/src/utils/covered.cpp +++ b/src/utils/covered.cpp @@ -113,14 +113,15 @@ main_covered(int argc, const char **argv) { bgzf_file in(filename, "r"); if (!in) throw runtime_error("could not open file: " + filename); - const auto outfile_mode = in.compression() ? "w" : "wu"; + const auto outfile_mode = in.is_compressed() ? "w" : "wu"; bgzf_file out(outfile, outfile_mode); if (!out) throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input - tpool.set_io(in); + if (in.is_bgzf()) + tpool.set_io(in); tpool.set_io(out); } diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp index 60b9a62f..c268c238 100644 --- a/src/utils/recovered.cpp +++ b/src/utils/recovered.cpp @@ -30,6 +30,7 @@ #include "smithlab_os.hpp" #include "bsutils.hpp" #include "dnmt_error.hpp" +#include "counts_header.hpp" #include "MSite.hpp" @@ -57,7 +58,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, bamxx::bam_tpool tp(n_threads); // set the threads for the input file decompression - if (n_threads > 1) + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); unordered_set chroms_seen; @@ -67,7 +68,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, int32_t prev_idx = -1; while (getline(in, line)) { - if (line[0] == '#') continue; + if (is_counts_header_line(line)) continue; line.resize(line.find_first_of(" \t")); if (line != prev_chrom) { if (verbose) cerr << "verifying: " << line << endl; @@ -277,7 +278,10 @@ process_sites(const bool verbose, const bool add_missing_chroms, string line; while (getline(in, line)) { - if (line[0] == '#') continue; + if (is_counts_header_line(line)) { + write_counts_header_line(line, out); + continue; + } site.initialize(line.data(), line.data() + size(line)); if (site.chrom != chrom_name) { diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index 0c3844ee..3c4190cb 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -33,6 +33,8 @@ #include "smithlab_os.hpp" #include "smithlab_utils.hpp" +#include "counts_header.hpp" + using std::cerr; using std::cout; using std::endl; @@ -61,9 +63,8 @@ get_first_site(T &in, T &out) { string line; bool within_header = true; while (within_header && getline(in, line)) { - if (line[0] == '#') { - line += '\n'; - out.write(line); + if (is_counts_header_line(line)) { + write_counts_header_line(line, out); } else { prev_site.initialize(line.data(), line.data() + size(line)); @@ -174,7 +175,7 @@ main_symmetric_cpgs(int argc, const char **argv) { if (!out) throw runtime_error("error opening output file: " + outfile); if (n_threads > 1) { - tp.set_io(in); + if (in.is_bgzf()) tp.set_io(in); tp.set_io(out); } diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index d78cf4e7..156eb885 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -31,6 +31,7 @@ #include "smithlab_os.hpp" #include "bsutils.hpp" #include "dnmt_error.hpp" +#include "counts_header.hpp" #include "MSite.hpp" @@ -50,6 +51,7 @@ using std::cbegin; using std::cend; using std::copy; using std::copy_n; +using std::to_string; using bamxx::bgzf_file; @@ -85,7 +87,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, bamxx::bam_tpool tp(n_threads); // set the threads for the input file decompression - if (n_threads > 1) + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); unordered_set chroms_seen; @@ -96,7 +98,7 @@ verify_chrom_orders(const bool verbose, const uint32_t n_threads, if (ret) throw runtime_error("failed to acquire buffer"); while (getline(in, line)) { - if (line.s[0] == '#') continue; + if (is_counts_header_line(line.s)) continue; if (std::isdigit(line.s[0])) continue; string chrom{line.s}; @@ -251,8 +253,10 @@ write_site(const string &name, const string &chrom, return bgzf_write(out.f, buf.data(), sz) == sz; } + typedef vector::const_iterator chrom_itr_t; + static chrom_itr_t get_chrom(const unordered_map &chrom_lookup, const string &chrom_name) { @@ -262,6 +266,7 @@ get_chrom(const unordered_map &chrom_lookup, return chrom_idx->second; } + static int32_t get_chrom_idx(const unordered_map &name_to_idx, const string &chrom_name) { @@ -271,8 +276,29 @@ get_chrom_idx(const unordered_map &name_to_idx, return chrom_idx->second; } + +static bool +verify_chrom(const string &header_line, + const unordered_map &name_to_idx, + vector &chrom_sizes) { + if (is_counts_header_version_line(header_line)) + return true; + std::istringstream iss(header_line.substr(1)); + string name; + uint64_t chrom_size = 0; + if (!(iss >> name >> chrom_size)) + return false; + + const auto idx = name_to_idx.find(name); + if (idx == cend(name_to_idx)) return false; + + return chrom_size == chrom_sizes[idx->second]; +} + + static void process_sites(const bool verbose, const bool add_missing_chroms, + const bool require_covered, const bool compress_output, const size_t n_threads, const string &infile, const string &outfile, const string &chroms_file) { @@ -309,7 +335,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, // set the threads for the input file decompression if (n_threads > 1) { - tp.set_io(in); + if (in.is_bgzf()) tp.set_io(in); tp.set_io(out); } @@ -328,16 +354,18 @@ process_sites(const bool verbose, const bool add_missing_chroms, chrom_itr_t chrom_itr; while (getline(in, line)) { - if (line.s[0] == '#') { - line.s[line.l++] = '\n'; - if (bgzf_write(out.f, line.s, line.l) != static_cast(line.l)) - throw runtime_error("failed to convert"); + if (is_counts_header_line(line.s)) { + string header_line{line.s}; + if (size(header_line) > 1 && !verify_chrom(header_line, name_to_idx, chrom_sizes)) + throw runtime_error("failed to verify header for: " + header_line); + if (!write_counts_header_line(header_line, out)) + throw runtime_error("failed to write header line: " + header_line); continue; } if (!std::isdigit(line.s[0])) { - if (pos != num_lim::max()) + if (!require_covered && pos != num_lim::max()) write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); chrom_name = string{line.s}; @@ -369,14 +397,15 @@ process_sites(const bool verbose, const bool add_missing_chroms, res = from_chars(res.ptr + 1, end_line, n_unmeth); const auto curr_pos = pos + pos_step; - if (pos + 1 < curr_pos) + if (!require_covered && pos + 1 < curr_pos) write_missing_sites(chrom_name, *chrom_itr, pos + 1, curr_pos, buf, out); write_site(chrom_name, *chrom_itr, curr_pos, n_meth, n_unmeth, buf, out); pos = curr_pos; } } - write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); + if (!require_covered) + write_missing_sites(chrom_name, *chrom_itr, pos + 1, size(*chrom_itr), buf, out); if (add_missing_chroms) { const int32_t chrom_idx = size(chroms); @@ -397,6 +426,7 @@ main_unxcounts(int argc, const char **argv) { bool verbose = false; bool add_missing_chroms = false; + bool require_covered = false; bool compress_output = false; size_t n_threads = 1; @@ -410,6 +440,8 @@ main_unxcounts(int argc, const char **argv) { ""); opt_parse.add_opt("output", 'o', "output file (required)", true, outfile); opt_parse.add_opt("missing", 'm', "add missing chroms", false, add_missing_chroms); + opt_parse.add_opt("reads", 'r', "report only sites with reads", false, + require_covered); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("chrom", 'c', "reference genome file (FASTA format)", true , chroms_file); @@ -434,11 +466,20 @@ main_unxcounts(int argc, const char **argv) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; } + if (require_covered && add_missing_chroms) { + cerr << "options mutually exclusive: reads and missing" << endl; + return EXIT_FAILURE; + } const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - process_sites(verbose, add_missing_chroms, compress_output, n_threads, - filename, outfile, chroms_file); + if (require_covered && add_missing_chroms) { + cerr << "options mutually exclusive: reads and missing" << endl; + return EXIT_FAILURE; + } + + process_sites(verbose, add_missing_chroms, require_covered, compress_output, + n_threads, filename, outfile, chroms_file); } catch (const std::runtime_error &e) { cerr << e.what() << endl; diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index a1ffe46a..ab271e6b 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -29,7 +29,10 @@ #include "OptionParser.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" + #include "MSite.hpp" +#include "counts_header.hpp" +#include "dnmt_error.hpp" using std::cerr; using std::cout; @@ -38,6 +41,7 @@ using std::runtime_error; using std::string; using std::to_chars; using std::vector; +using std::to_string; using bamxx::bgzf_file; @@ -78,7 +82,9 @@ fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { int main_xcounts(int argc, const char **argv) { try { + bool require_coverage = false; size_t n_threads = 1; + string genome_file; string outfile{"-"}; const string description = @@ -89,6 +95,10 @@ main_xcounts(int argc, const char **argv) { " (\"-\" for standard input)", 1); opt_parse.add_opt("output", 'o', "output file (default is standard out)", false, outfile); + opt_parse.add_opt("chroms", 'c', "make header from this reference", + false, genome_file); + opt_parse.add_opt("reads", 'r', "ouput only sites with reads", + false, require_coverage); opt_parse.add_opt("threads", 't', "threads for compression (use few)", false, n_threads); std::vector leftover_args; @@ -113,40 +123,54 @@ main_xcounts(int argc, const char **argv) { const string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ + vector chrom_names; + vector chrom_sizes; + if (!genome_file.empty()) { + const int ret = + get_chrom_sizes_for_counts_header(n_threads, genome_file, + chrom_names, chrom_sizes); + if (ret) throw dnmt_error("failed to get chrom sizes from: " + + genome_file); + } + + bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); + if (!in) throw dnmt_error("could not open file: " + filename); - const auto outfile_mode = in.compression() ? "w" : "wu"; + const auto outfile_mode = in.is_compressed() ? "w" : "wu"; bgzf_file out(outfile, outfile_mode); - if (!out) throw runtime_error("error opening output file: " + outfile); + if (!out) throw dnmt_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input - tpool.set_io(in); + if (in.is_bgzf()) + tpool.set_io(in); tpool.set_io(out); } + if (!genome_file.empty()) + write_counts_header_from_chom_sizes(chrom_names, chrom_sizes, out); + // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); - if (ret) throw runtime_error("failed to acquire buffer"); + if (ret) throw dnmt_error("failed to acquire buffer"); vector buf(128); uint32_t offset = 0; string prev_chrom; bool status_ok = true; - int64_t sz = 0; MSite site; while (status_ok && getline(in, line)) { - if (line.s[0] == '#') { - const auto header_line = string(line.s) + "\n"; - sz = size(header_line); - status_ok = bgzf_write(out.f, header_line.data(), sz) == sz; + if (is_counts_header_line(line.s)) { + if (!genome_file.empty()) continue; + const string header_line{line.s}; + write_counts_header_line(header_line, out); continue; } status_ok = site.initialize(line.s, line.s + line.l); @@ -157,22 +181,24 @@ main_xcounts(int argc, const char **argv) { offset = 0; site.chrom += '\n'; - sz = size(site.chrom); + const int64_t sz = size(site.chrom); status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz; } - if (site.n_reads > 0 || site.is_mutated()) { - sz = fill_output_buffer(offset, site, buf); + if (site.n_reads > 0) { + const int64_t sz = fill_output_buffer(offset, site, buf); status_ok = bgzf_write(out.f, buf.data(), sz) == sz; offset = site.pos; } } + ks_free(&line); + if (!status_ok) { cerr << "failed converting " << filename << " to " << outfile << endl; return EXIT_FAILURE; } } - catch (const std::runtime_error &e) { + catch (const std::exception &e) { cerr << e.what() << endl; return EXIT_FAILURE; }