Skip to content

Commit

Permalink
Merge pull request #10 from DecodeGenetics/checksum_from_cram
Browse files Browse the repository at this point in the history
Checksum from cram
  • Loading branch information
pmelsted authored Mar 7, 2020
2 parents 1c98778 + 2fdb5d0 commit de6d8d9
Show file tree
Hide file tree
Showing 790 changed files with 163,163 additions and 148,688 deletions.
20 changes: 11 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
# include SeqAn libraries, don't warn about MD5 deprecation
CXXFLAGS+=-I. -Wno-deprecated-declarations

#include htslib by setting -I<path_to_htslib/include> and link to the htslib library
HTSDIR=<path_to_htsdir>
CXXFLAGS+=-I$(HTSDIR)/include

# RELEASE build
CXXFLAGS+= -O3 -DSEQAN_ENABLE_TESTING=0 -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_HAS_ZLIB=1
LDLIBS=-lz -lssl -lcrypto

# RELEASE build
CXX=g++ -std=c++11 -pthread
CXXFLAGS+= -O3 -DSEQAN_ENABLE_TESTING=0 -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_HAS_ZLIB=1
LDFLAGS=-L$(HTSDIR)/lib -lz -lssl -lcrypto -Wl,-rpath,$(HTSDIR)/lib -lhts

TARGET = bamhash_checksum_bam bamhash_checksum_fastq bamhash_checksum_fasta
all: $(TARGET)

bamhash_checksum_bam: bamhash_checksum_common.o bamhash_checksum_bam.o
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
$(CXX) $(LDFLAGS) -o $@ $^

bamhash_checksum_fastq: bamhash_checksum_common.o bamhash_checksum_fastq.o
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
$(CXX) $(LDFLAGS) -o $@ $^

bamhash_checksum_fasta: bamhash_checksum_common.o bamhash_checksum_fasta.o
$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)

$(CXX) $(LDFLAGS) -o $@ $^

clean:
$(RM) *.o *~ $(TARGET)
$(RM) *.o *~ $(TARGET)
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ composed of the readname, whether it is first or last in pair, sequence and qual
All the hash values are summed up so the result is independent of the ordering within the files.
The result can be compared to verify that the pair of FASTQ files contain the same read
information as the aligned BAM file.
The program is written in C++ and uses SeqAnHTS v1.0 for parsing FASTQ, gzip compressed FASTQ and BAM files.
SeqAnHTS is a fork of SeqAn library ( Döring etal. , 2008 ) that uses htslib to read SAM/BAM/CRAM files.

## Manuscript

Expand Down Expand Up @@ -35,6 +37,7 @@ Both multiline FASTA and FASTQ are supported and gzipped input for FASTA and FAS

~~~
bamhash_checksum_bam [OPTIONS] <in.bam> <in2.bam> ...
bamhash_checksum_bam [OPTIONS] -r <reference-file> <in.cram>
~~~

processes a number of BAM files. BAM files are assumed to contain paired end reads. If you run with `--no-paired` it treats all reads as single end and displays a warning if any read is marked as "second in pair" in the BAM file.
Expand All @@ -57,4 +60,8 @@ processes a number of FASTA files. All FASTA files are assumed to be single end

## Compiling

The only external dependency is on OpenSSL for the MD5 implementation.
External dependencies are on:
OpenSSL for the MD5 implementation
htslib library (version 1.9)


179 changes: 119 additions & 60 deletions bamhash_checksum_bam.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "htslib/hts.h"
#include <seqan/bam_io.h>
#include <iostream>
#include <seqan/stream.h>
#include <seqan/bam_io.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
Expand All @@ -9,26 +10,29 @@
#include <stdint.h>
#include <vector>
#include <seqan/arg_parse.h>
#include <seqan/hts_io.h>


#include "bamhash_checksum_common.h"

#include "bamhash_checksum_common.h"

/** only needed for seqan 1.4.1 and lower
inline bool
hasFlagSupplementary(seqan::BamAlignmentRecord const & record)
{
return (record.flag & 0x0800) == 0x0800;
}
*/
struct Baminfo {
std::vector<std::string> bamfiles;
bool debug;
bool noReadNames;
bool noQuality;
bool paired;
seqan::CharString reference;

Baminfo() : debug(false), noReadNames(false), noQuality(false), paired(true) {}
Baminfo() : debug(false), noReadNames(false), noQuality(false), paired(true), reference("") {}

};

struct Counts {
uint64_t sum;
uint64_t count;

Counts() : sum(0), count(0) {}

};

Expand All @@ -38,24 +42,27 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
seqan::ArgumentParser parser("bamhash_checksum_bam");
//readlink("/proc/self/exe", options.bindir, sizeof(options.bindir)-1);

setShortDescription(parser, "Checksum of a bam file"); //TODO change description
setShortDescription(parser, "Checksum of a sam, bam or cram file"); //TODO change description
setVersion(parser, BAMHASH_VERSION);
setDate(parser, "May 2015");
setDate(parser, "Oct 2018");

addUsageLine(parser, "[\\fIOPTIONS\\fP] \\fI<in.bam> <in2.bam> ...\\fP");
addDescription(parser, "Program for checksum of sequence reads. ");

addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUTFILE,"bamfile", "False", 1));

setValidValues(parser, 0,"bam sam");
addArgument(parser, seqan::ArgParseArgument(seqan::ArgParseArgument::INPUT_FILE,"bamfile", "False", 1));
setValidValues(parser, 0,"sam bam cram");

addSection(parser, "Options");

//add debug option:
addOption(parser, seqan::ArgParseOption("d", "debug", "Debug mode. Prints full hex for each read to stdout"));
addOption(parser, seqan::ArgParseOption("R", "no-readnames", "Do not use read names as part of checksum"));
addOption(parser, seqan::ArgParseOption("Q", "no-quality", "Do not use read quality as part of checksum"));
addOption(parser, seqan::ArgParseOption("P", "no-paired", "Bam files were not generated with paired-end reads"));
addOption(parser, seqan::ArgParseOption("P", "no-paired", "Cram files were not generated with paired-end reads"));
addOption(parser, seqan::ArgParseOption("r", "reference-file", "Path to reference-file if reference not given in header",
seqan::ArgParseArgument::INPUT_FILE));

setValidValues(parser, "reference-file", "fa");

// Parse command line.
seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
Expand All @@ -67,13 +74,76 @@ parseCommandLine(Baminfo& options, int argc, char const **argv) {
options.noReadNames = isSet(parser, "no-readnames");
options.noQuality = isSet(parser, "no-quality");
options.paired = !isSet(parser, "no-paired");
getOptionValue(options.reference, parser, "reference-file");

options.bamfiles = getArgumentValues(parser, 0);

return seqan::ArgumentParser::PARSE_OK;
}


// -----------------------------------------------------------------------------
// FUNCTION getSampleIdAndLaneNames()
// -----------------------------------------------------------------------------

void getLaneNames(std::map<seqan::CharString, unsigned> & laneNames, std::string const & header)
{
for (int i = 0; i < header.size(); /*empty on purpose*/)
{
auto hdr_find_it = std::find(header.begin() + i, header.end(), '\n');
std::string line = header.substr(i, hdr_find_it - header.begin() - i);

if (line.size() > 7 && line[0] == '@' && line[1] == 'R' && line[2] == 'G' && line[3] == '\t')
{
for (int j = 0; j < static_cast<int>(line.size()); /*empty on purpose*/)
{
auto line_find_it = std::find(line.begin() + j, line.end(), '\t');
std::string field = line.substr(j, line_find_it - line.begin() - j);

if (field.size() > 3 && field[0] == 'I' && field[1] == 'D' && field[2] == ':')
{
seqan::CharString read_group_name = field.substr(3);
int read_group_index = laneNames.size();
laneNames[read_group_name] = read_group_index;
}

j = std::distance(line.begin(), line_find_it) + 1;
}
}

i = std::distance(header.begin(), hdr_find_it) + 1;
}
}

// -----------------------------------------------------------------------------
// FUNCTION getLane()
// -----------------------------------------------------------------------------

int getLane(seqan::BamAlignmentRecord & record,
seqan::BamTagsDict & tagsDict,
std::map<seqan::CharString, unsigned> & laneNames)
{
unsigned tagIdx = 0;

if (!seqan::findTagKey(tagIdx, tagsDict, "RG"))
{
std::cerr << "ERROR: Found a read with a missing read group (RG) tag\n";
return -1;
}

seqan::CharString read_group;

if(!seqan::extractTagValue(read_group, tagsDict, tagIdx))
{
std::cerr << "ERROR: Failed to extract read group (RG) tag value\n";
return -1;
}

return laneNames[read_group];
}



int main(int argc, char const **argv) {

Baminfo info; // Define structure variable
Expand All @@ -83,58 +153,48 @@ int main(int argc, char const **argv) {
return res == seqan::ArgumentParser::PARSE_ERROR;
}

uint64_t sum = 0;
uint64_t count = 0;
//Moving below to struct
// uint64_t sum = 0;
// uint64_t count = 0;
bool pairedWarning = false;


//adding new stuff
std::map<seqan::CharString, unsigned> laneNames;
// Initialize all counts for each lane.
seqan::String<Counts> counts;

for (int i = 0; i < info.bamfiles.size(); i++) {
// Open BGZF Stream for reading.
seqan::Stream<seqan::Bgzf> inStream;

const char* bamfile = info.bamfiles[i].c_str();

if (!open(inStream, bamfile, "r")) {
std::cerr << "ERROR: Could not open " << bamfile << " for reading.\n";
return 1;
}
const char* reference = toCString(info.reference);

// Setup name store, cache, and BAM I/O context.
typedef seqan::StringSet<seqan::CharString> TNameStore;
typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
typedef seqan::BamIOContext<TNameStore> TBamIOContext;
TNameStore nameStore;
TNameStoreCache nameStoreCache(nameStore);
TBamIOContext context(nameStore, nameStoreCache);

// Read header.
seqan::BamHeader header;
if (readRecord(header, context, inStream, seqan::Bam()) != 0) {
std::cerr << "ERROR: Could not read header from BAM file " << bamfile << "\n";
return 1;
}
seqan::clear(header);
// Open stream for reading
seqan::HtsFile inStream(bamfile, "r", reference);

// Initialize lane names (read groups).
std::string header(inStream.hdr->text, inStream.hdr->l_text);
getLaneNames(laneNames, header);
unsigned lanecount = laneNames.size();
resize(counts, lanecount);

// Define:
seqan::BamAlignmentRecord record;
seqan::CharString string2hash;
//char hexCstr[33];

// Read record
while (!atEnd(inStream)) {
if (readRecord(record, context, inStream, seqan::Bam()) != 0) {
std::cerr << "ERROR: Could not read record from BAM File " << bamfile << "\n";
return 1;
}
while (seqan::readRecord(record, inStream)){
seqan::BamTagsDict tagsDict(record.tags);
int l = getLane(record, tagsDict, laneNames);
if (l == -1) return 1;

// Check if flag: reverse complement and change record accordingly
if (hasFlagRC(record)) {
reverseComplement(record.seq);
reverse(record.qual);
seqan::reverseComplement(record.seq);
seqan::reverse(record.qual);
}
// Check if flag: supplementary and exclude those
if (!hasFlagSupplementary(record) && !hasFlagSecondary(record)) {
count +=1;
counts[l].count +=1;
// Construct one string from record
if (!info.noReadNames) {
seqan::append(string2hash, record.qName);
Expand All @@ -143,7 +203,7 @@ int main(int argc, char const **argv) {
seqan::append(string2hash, "/2");
} else {
if (!pairedWarning) {
std::cerr << "WARNING: BamHash was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
std::cerr << "WARNING: seqread was run with --no-paired mode, but BAM file has reads marked as second pair" << std::endl;
pairedWarning = true;
}
seqan::append(string2hash, "/1");
Expand All @@ -162,26 +222,25 @@ int main(int argc, char const **argv) {
// Get MD5 hash
hash_t hex = str2md5(toCString(string2hash), length(string2hash));


if (info.debug) {
std::cout << string2hash << " " << std::hex << hex.p.low << "\n";
} else {
hexSum(hex, sum);
hexSum(hex, counts[l].sum);
}

seqan::clear(string2hash);
}
}

// print result

}

if (!info.debug) {
std::cout << std::hex << sum << "\t";
std::cout << std::dec << count << "\n";
for (std::map<seqan::CharString, unsigned>::iterator it = laneNames.begin(); it != laneNames.end(); ++it) {
std::cout << it->first << "\t";
int lid = it->second;
std::cout << std::hex << counts[lid].sum << "\t";
std::cout << std::dec << counts[lid].count << "\n";
}
}

return 0;
}

2 changes: 1 addition & 1 deletion bamhash_checksum_common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef BAMHASH_CHECKSUM_COMMON_H
#define BAMHASH_CHECKSUM_COMMON_H

#define BAMHASH_VERSION "1.1"
#define BAMHASH_VERSION "1.3"

#include <string>
#include <stdint.h>
Expand Down
Loading

0 comments on commit de6d8d9

Please sign in to comment.