Max buffer size argument for uniq #201

Merged: 23 commits, Dec 12, 2023

Changes from all commits

Commits (23)
b786395
xcounts: adding code to get a header from a fasta reference genome file
andrewdavidsmith Dec 9, 2023
09eea21
xcounts: testing some threading issues
andrewdavidsmith Dec 9, 2023
048408e
several source files: checking that input file is BGZF format before …
andrewdavidsmith Dec 9, 2023
f658558
bamxx submodule update
andrewdavidsmith Dec 9, 2023
5b3568a
xcounts: fixing a bug from previous commit
andrewdavidsmith Dec 9, 2023
f50caba
unxcounts: including logic to verify header against chromosomes seque…
andrewdavidsmith Dec 9, 2023
16a3fdc
unxcounts: adding logic to only generate the counts for covered sites
andrewdavidsmith Dec 9, 2023
2583dc8
xcounts: ignoring the mutation mark on sites
andrewdavidsmith Dec 9, 2023
84846ce
unxcounts: better error message on inconsistent header
andrewdavidsmith Dec 9, 2023
3edadb1
unxcounts: trying to find the best header termination
andrewdavidsmith Dec 9, 2023
fd39ce8
unxcounts: better handling of header
andrewdavidsmith Dec 9, 2023
6db3e53
xcounts and counts: header no longer ends with end_header and also th…
andrewdavidsmith Dec 9, 2023
76ecc8c
unxcounts: fixing backwards condition for requiring coverage
andrewdavidsmith Dec 9, 2023
944e660
unxcounts: adding a missed spot to check for required coverage
andrewdavidsmith Dec 9, 2023
0e329e9
counts_header.cpp counts_header.hpp Makefile.am: adding counts_header…
andrewdavidsmith Dec 9, 2023
2bfcef0
MSite.cpp: adding a loop to pass the header when trying to determine …
andrewdavidsmith Dec 9, 2023
dd6cb9d
hmr.cpp and pmd.cpp: adding code to skip the header when reading sites
andrewdavidsmith Dec 9, 2023
699a0f2
methcounts.cpp: refactoring to move the header code into counts_heade…
andrewdavidsmith Dec 9, 2023
edeb17d
recovered.cpp: refactoring to move the header stuff into counts_header
andrewdavidsmith Dec 9, 2023
c13c28d
symmetric-cpgs.cpp: refactoring to move the header stuff into counts_h…
andrewdavidsmith Dec 9, 2023
1e1707c
xcounts.cpp: refactoring to move the header stuff into counts_header
andrewdavidsmith Dec 9, 2023
6acba7b
unxcounts.cpp: refactoring to move the header stuff into counts_header
andrewdavidsmith Dec 9, 2023
32fd022
uniq.cpp: adding a variable to control the max buffer size which matt…
andrewdavidsmith Dec 12, 2023
2 changes: 2 additions & 0 deletions Makefile.am
@@ -145,6 +145,7 @@ noinst_LIBRARIES = libdnmtools.a
libdnmtools_a_SOURCES = \
src/common/dnmtools_gaussinv.cpp \
src/common/bam_record_utils.cpp \
src/common/counts_header.cpp \
src/common/BetaBin.cpp \
src/common/EmissionDistribution.cpp \
src/common/Epiread.cpp \
@@ -163,6 +164,7 @@ libdnmtools_a_SOURCES += \
src/bamxx/bamxx.hpp \
src/common/dnmtools_gaussinv.hpp \
src/common/bam_record_utils.hpp \
src/common/counts_header.hpp \
src/common/BetaBin.hpp \
src/common/EmissionDistribution.hpp \
src/common/Epiread.hpp \
4 changes: 2 additions & 2 deletions src/amrfinder/amrfinder.cpp
@@ -472,10 +472,10 @@ main_amrfinder(int argc, const char **argv) {
const EpireadStats epistat{low_prob, high_prob, critical_value,
max_itr, use_bic, correct_for_read_count};

// bamxx::bam_tpool tp(n_threads);
bamxx::bam_tpool tp(n_threads);
bgzf_file in(reads_file, "r");
if (!in) throw runtime_error("failed to open input file: " + reads_file);
// if (n_threads > 1) tp.set_io(in);
if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);

vector<GenomicRegion> amrs;

4 changes: 4 additions & 0 deletions src/analysis/hmr.cpp
@@ -33,6 +33,7 @@

#include "TwoStateHMM.hpp"
#include "MSite.hpp"
#include "counts_header.hpp"

using std::string;
using std::vector;
@@ -337,6 +338,9 @@ load_cpgs(const string &cpgs_file, vector<MSite> &cpgs,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("failed opening file: " + cpgs_file);

if (has_counts_header(cpgs_file))
skip_counts_header(in);

MSite prev_site, the_site;
while (read_site(in, the_site)) {
if (!the_site.is_cpg() || distance(prev_site, the_site) < 2)
14 changes: 2 additions & 12 deletions src/analysis/methcounts.cpp
@@ -36,6 +36,7 @@
#include "bsutils.hpp"
#include "dnmt_error.hpp"
#include "bam_record_utils.hpp"
#include "counts_header.hpp"

/* HTSlib */
#include <htslib/sam.h>
@@ -382,17 +383,6 @@ consistent_targets(const bam_header &hdr,
return true;
}

static void
write_header(const bam_header &hdr, bgzf_file &out) {
for (auto i = 0; i < hdr.h->n_targets; ++i) {
const size_t tid_size = sam_hdr_tid2len(hdr, i);
const string tid_name = sam_hdr_tid2name(hdr, i);
std::ostringstream oss;
oss << "# " << tid_name << ' ' << tid_size << '\n';
out.write(oss.str());
}
out.write("# end_header\n");
}

template<const bool require_covered = false> static void
process_reads(const bool VERBOSE, const bool show_progress,
@@ -443,7 +433,7 @@ }
}

if (include_header)
write_header(hdr, out);
write_counts_header_from_bam_header(hdr, out);

// now iterate over the reads, switching chromosomes and writing
// output as needed
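For reference, the removed write_header wrote one "# name size" line per target and closed with "# end_header", while its replacement, write_counts_header_from_bam_header (defined in src/common/counts_header.cpp below), prepends a "#DNMTOOLS <version>" line, drops the space after "#", and terminates the header with a bare "#". A sketch of the two formats, using hypothetical chromosome names and sizes:

    Old header (removed write_header):
    # chr1 248956422
    # chr2 242193529
    # end_header

    New header (write_counts_header_from_bam_header):
    #DNMTOOLS <version>
    #chr1 248956422
    #chr2 242193529
    #
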
9 changes: 6 additions & 3 deletions src/analysis/methstates.cpp
@@ -254,14 +254,17 @@ main_methstates(int argc, const char **argv) {
bamxx::bam_header hdr(in);
if (!hdr) throw dnmt_error("cannot read header: " + mapped_reads_file);

/* set the threads for the input file decompression */
if (n_threads > 1) tp.set_io(in);

// open the output file
const string output_mode = compress_output ? "w" : "wu";
bamxx::bgzf_file out(outfile, output_mode);
if (!out) throw dnmt_error("error opening output file: " + outfile);

/* set the threads for the input file decompression */
if (n_threads > 1) {
tp.set_io(in);
tp.set_io(out);
}

vector<uint64_t> cpgs;

unordered_set<string> chroms_seen;
27 changes: 25 additions & 2 deletions src/analysis/pmd.cpp
@@ -33,6 +33,7 @@
#include "GenomicRegion.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "counts_header.hpp"

#include "TwoStateHMM_PMD.hpp"
#include "MSite.hpp"
@@ -240,8 +241,11 @@ get_optimized_boundary_likelihoods(const vector<string> &cpgs_file,
static const double array_coverage_constant = 10;

vector<bgzf_file*> in(cpgs_file.size());
for (size_t i = 0; i < cpgs_file.size(); ++i)
for (size_t i = 0; i < cpgs_file.size(); ++i) {
in[i] = new bgzf_file(cpgs_file[i], "r");
if (has_counts_header(cpgs_file[i]))
skip_counts_header(*in[i]);
}

std::map<size_t, pair<size_t, size_t> > pos_meth_tot;
size_t n_meth = 0ul, n_reads = 0ul;
@@ -344,8 +348,11 @@ find_exact_boundaries(const vector<string> &cpgs_file,
static const double array_coverage_constant = 10;

vector<bgzf_file*> in(cpgs_file.size());
for (size_t i = 0; i < cpgs_file.size(); ++i)
for (size_t i = 0; i < cpgs_file.size(); ++i) {
in[i] = new bgzf_file(cpgs_file[i], "r");
if (has_counts_header(cpgs_file[i]))
skip_counts_header(*in[i]);
}

std::map<size_t, pair<size_t, size_t> > pos_meth_tot;
size_t n_meth = 0ul, n_reads = 0ul;
@@ -653,6 +660,7 @@ shuffle_bins(const size_t rng_seed,
sort(begin(domain_scores), end(domain_scores));
}


static void
assign_p_values(const vector<double> &random_scores,
const vector<double> &observed_scores,
@@ -665,6 +673,7 @@ assign_p_values(const vector<double> &random_scores,
}
}


static void
read_params_file(const bool VERBOSE,
const string &params_file,
@@ -759,6 +768,9 @@ check_if_array_data(const string &infile) {
bgzf_file in(infile, "r");
if (!in) throw std::runtime_error("bad file: " + infile);

if (has_counts_header(infile))
skip_counts_header(in);

std::string line;
getline(in, line);
std::istringstream iss(line);
@@ -779,6 +791,10 @@ load_array_data(const size_t bin_size,

bgzf_file in(cpgs_file, "r");
if (!in) throw std::runtime_error("bad sites file: " + cpgs_file);

if (has_counts_header(cpgs_file))
skip_counts_header(in);

string curr_chrom;
size_t prev_pos = 0ul, curr_pos = 0ul;
double array_meth_bin = 0.0;
@@ -872,6 +888,9 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("bad sites file: " + cpgs_file);

if (has_counts_header(cpgs_file))
skip_counts_header(in);

// keep track of the chroms we've seen
string curr_chrom;
std::unordered_set<string> chroms_seen;
@@ -947,6 +966,9 @@ load_read_counts(const string &cpgs_file, const size_t bin_size,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("bad methcounts file: " + cpgs_file);

if (has_counts_header(cpgs_file))
skip_counts_header(in);

// keep track of where we are and what we've seen
size_t bin_start = 0ul;
string curr_chrom;
@@ -970,6 +992,7 @@ }
}
}


static double
good_bins_frac(const vector<size_t> &cumulative, const size_t min_bin_size,
const size_t bin_size, const size_t min_cov_to_pass) {
2 changes: 1 addition & 1 deletion src/bamxx
Submodule bamxx updated 1 file
+8 −2 bamxx.hpp
4 changes: 3 additions & 1 deletion src/common/MSite.cpp
@@ -24,6 +24,7 @@
#include <charconv>

#include "smithlab_utils.hpp"
#include "counts_header.hpp"

using std::string;
using std::runtime_error;
@@ -361,7 +362,8 @@ is_msite_file(const string &file) {
if (!in) throw runtime_error("cannot open file: " + file);

string line;
if (!getline(in, line)) return false;
while (getline(in, line) && is_counts_header_line(line))
;

return is_msite_line(line);
}
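The loop above skips any leading counts-header lines before testing whether the first data line looks like an MSite. The predicate is_counts_header_line is declared in counts_header.hpp, which is not part of this diff; a minimal sketch of what it is assumed to do, consistent with skip_counts_header and has_counts_header in counts_header.cpp below, is simply testing for a leading '#':

    // hypothetical sketch; the real declaration lives in counts_header.hpp
    inline bool
    is_counts_header_line(const std::string &line) {
      // every header line written by the counts_header code begins with '#'
      return !line.empty() && line[0] == '#';
    }
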
163 changes: 163 additions & 0 deletions src/common/counts_header.cpp
@@ -0,0 +1,163 @@
/* counts_header: code for reading and writing the counts/xcounts
 * header format that is shared by several tools.
*
* Copyright (C) 2023 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#include "counts_header.hpp"

#include <string>
#include <vector>
#include <cassert>
#include <cstdint>
#include <sstream>

#include "bam_record_utils.hpp"

// generated by autotools
#include <config.h>

#include "bamxx.hpp"

using std::vector;
using std::string;
using std::to_string;

using bamxx::bgzf_file;

void
write_counts_header_from_chom_sizes(const vector<string> &chrom_names,
const vector<uint64_t> &chrom_sizes,
bgzf_file &out) {
const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
out.write(version.c_str());
for (auto i = 0u; i < size(chrom_sizes); ++i) {
const string tmp =
"#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
out.write(tmp.c_str());
}
out.write("#\n");
}


inline bgzf_file &
getline(bgzf_file &file, kstring_t &line) {
if (file.f == nullptr) return file;
const int x = bgzf_getline(file.f, '\n', &line);
if (x == -1) {
file.destroy();
free(line.s);
line = {0, 0, nullptr};
}
if (x < -1) {
// ADS: this is an error condition and should be handled
// differently from the EOF above.
file.destroy();
free(line.s);
line = {0, 0, nullptr};
}
return file;
}


bamxx::bgzf_file &
skip_counts_header(bamxx::bgzf_file &in) {

// use the kstring_t type to more directly use the BGZF file
kstring_t line{0, 0, nullptr};
const int ret = ks_resize(&line, 1024);
if (ret) return in;

while (getline(in, line) && line.s[0] == '#') {
if (line.s[0] == '#' && line.l == 1)
return in;
}
// otherwise we have missed the final line of the header
assert(line.s[0] != '#');
return in;
}


int
get_chrom_sizes_for_counts_header(const uint32_t n_threads,
const string &filename,
vector<string> &chrom_names,
vector<uint64_t> &chrom_sizes) {

bamxx::bam_tpool tpool(n_threads);

bgzf_file in(filename, "r");
if (!in) return -1;
if (n_threads > 1 && in.is_bgzf())
tpool.set_io(in);

chrom_names.clear();
chrom_sizes.clear();

// use the kstring_t type to more directly use the BGZF file
kstring_t line{0, 0, nullptr};
const int ret = ks_resize(&line, 1024);
if (ret) return -1;

uint64_t chrom_size = 0;
while (getline(in, line)) {
if (line.s[0] == '>') {
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
chrom_names.emplace_back(line.s + 1);
chrom_size = 0;
}
else chrom_size += line.l;
}
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);

ks_free(&line);

assert(size(chrom_names) == size(chrom_sizes));

return 0;
}


void
write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
bgzf_file &out) {
const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
out.write(version.c_str());
for (auto i = 0; i < hdr.h->n_targets; ++i) {
const size_t tid_size = sam_hdr_tid2len(hdr, i);
const string tid_name = sam_hdr_tid2name(hdr, i);
std::ostringstream oss;
oss << "#" << tid_name << ' ' << tid_size << '\n';
out.write(oss.str());
}
out.write("#\n");
}


bool
write_counts_header_line(string line, bgzf_file &out) {
line += '\n';
const int64_t ret = bgzf_write(out.f, line.data(), size(line));
return ret == static_cast<int64_t>(size(line));
}

bool
has_counts_header(const string &filename) {
bgzf_file in(filename, "r");
if (!in) return false;
string line;
if (!getline(in, line)) return false;
return line[0] == '#';
}
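Taken together, the consumer pattern this PR applies in hmr.cpp and pmd.cpp is: test the path with has_counts_header, open the bgzf_file, call skip_counts_header, then read sites as before. A minimal sketch mirroring load_cpgs in hmr.cpp (error handling abbreviated; load_sites and sites_file are illustrative names, not part of the PR):

    #include <stdexcept>
    #include <string>
    #include <vector>

    #include "MSite.hpp"
    #include "bamxx.hpp"
    #include "counts_header.hpp"

    static void
    load_sites(const std::string &sites_file, std::vector<MSite> &sites) {
      bamxx::bgzf_file in(sites_file, "r");
      if (!in) throw std::runtime_error("failed opening file: " + sites_file);

      // skip the "#" header block, if present, so read_site only sees data lines
      if (has_counts_header(sites_file))
        skip_counts_header(in);

      MSite the_site;
      while (read_site(in, the_site))
        sites.push_back(the_site);
    }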