diff --git a/CMakeLists.txt b/CMakeLists.txt index 561a375..ab37515 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,8 @@ endif () if (${CMAKE_BUILD_TYPE} MATCHES Debug) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -g") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O -g") + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) else() # Use all standard-compliant optimizations - always add these: set (CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${PIC_FLAG} ${EXTRA_FLAGS}") diff --git a/src/common/progress.hpp b/src/common/progress.hpp index e63e531..fd9d5b9 100644 --- a/src/common/progress.hpp +++ b/src/common/progress.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/src/common/utils.hpp b/src/common/utils.hpp index f5646dc..119be9d 100644 --- a/src/common/utils.hpp +++ b/src/common/utils.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index ff5d5f0..3bdc4ed 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -10,6 +10,7 @@ #include #include #include +#include "common/progress.hpp" namespace skch { @@ -165,6 +166,7 @@ namespace skch //--for split read mapping + long double kmerComplexity; // Estimated sequence complexity int n_merged; // how many mappings we've merged into this one offset_t splitMappingId; // To identify split mappings that are chained uint8_t discard; // set to 1 for deletion @@ -204,10 +206,11 @@ namespace skch //Container to save copy of kseq object struct InputSeqContainer { - seqno_t seqCounter; //sequence counter - offset_t len; //sequence length - std::string seq; //sequence string - std::string seqName; //sequence id + seqno_t seqCounter; //sequence counter + offset_t len; //sequence length + std::string seq; //sequence string + std::string seqName; //sequence id + /* * @brief constructor @@ -216,12 +219,30 @@ namespace skch * @param[in] len length of sequence */ InputSeqContainer(const std::string& s, const std::string& id, seqno_t seqcount) - : seq(s) - , seqName(id) + : seqCounter(seqcount) , len(s.length()) - , seqCounter(seqcount) { } + , seq(s) + , seqName(id) { } + }; + + struct InputSeqProgContainer : InputSeqContainer + { + using InputSeqContainer::InputSeqContainer; + progress_meter::ProgressMeter& progress; //progress meter (shared) + + + /* + * @brief constructor + * @param[in] kseq_seq complete read or reference sequence + * @param[in] kseq_id sequence id name + * @param[in] len length of sequence + */ + InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm) + : InputSeqContainer(s, id, seqcount) + , progress(pm) { } }; + //Output type of map function struct MapModuleOutput { @@ -248,6 +269,8 @@ namespace skch std::string seqName; //sequence name MinmerVec minmerTableQuery; //Vector of minmers in the query MinmerVec seedHits; //Vector of minmers in the reference + int refGroup; //Prefix group of sequence + float kmerComplexity; //Estimated sequence complexity }; } diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp index 76946ff..4ee72a2 100644 --- a/src/map/include/commonFunc.hpp +++ b/src/map/include/commonFunc.hpp @@ -186,9 +186,25 @@ namespace skch { ankerl::unordered_dense::map sketched_vals; std::vector sketched_heap; sketched_heap.reserve(sketchSize+1); + + // Get distance until last "N" + int ambig_kmer_count = 0; + for (int i = kmerSize - 1; i >= 0; i--) + { + if (seq[i] == 'N') + { + ambig_kmer_count = i+1; + break; + } + } for(offset_t i = 0; i < len - kmerSize + 1; i++) { + + if (seq[i+kmerSize-1] == 'N') + { + ambig_kmer_count = kmerSize; + } //Hash kmers hash_t hashFwd = CommonFunc::getHash(seq + i, kmerSize); hash_t hashBwd; @@ -199,7 +215,7 @@ namespace skch { hashBwd = std::numeric_limits::max(); //Pick a dummy high value so that it is ignored later //Consider non-symmetric kmers only - if(hashBwd != hashFwd) + if(hashBwd != hashFwd && ambig_kmer_count == 0) { //Take minimum value of kmer and its reverse complement hash_t currentKmer = std::min(hashFwd, hashBwd); @@ -237,6 +253,10 @@ namespace skch { } } } + if (ambig_kmer_count > 0) + { + ambig_kmer_count--; + } } minmerIndex.resize(sketched_heap.size()); @@ -293,6 +313,10 @@ namespace skch { //if(alphabetSize == 4) //not protein //CommonFunc::reverseComplement(seq, seqRev.get(), len); + + // Get distance until last "N" + int ambig_kmer_count = 0; + for(offset_t i = 0; i < len - kmerSize + 1; i++) { @@ -369,8 +393,12 @@ namespace skch { Q.pop_front(); } + if (seq[i+kmerSize-1] == 'N') + { + ambig_kmer_count = kmerSize; + } //Consider non-symmetric kmers only - if(hashBwd != hashFwd) + if(hashBwd != hashFwd && ambig_kmer_count == 0) { // Add current hash to window Q.push_back(std::make_tuple(currentKmer, currentStrand, i)); @@ -399,6 +427,11 @@ namespace skch { std::push_heap(heapWindow.begin(), heapWindow.end(), KIHeap_cmp); } } + if (ambig_kmer_count > 0) + { + ambig_kmer_count--; + } + diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 38445db..e9df69f 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -7,6 +7,7 @@ #ifndef SKETCH_MAP_HPP #define SKETCH_MAP_HPP +#include #include #include #include @@ -106,6 +107,10 @@ namespace skch //for an L1 candidate if the best intersection size is i; std::vector sketchCutoffs; + //Vector for obtaining group from refId + //if refIdGroup[i] == refIdGroup[j], then sequence i and j have the same prefix; + std::vector refIdGroup; + public: /** @@ -117,18 +122,58 @@ namespace skch Map(const skch::Parameters &p, const skch::Sketch &refsketch, PostProcessResultsFn_t f = nullptr) : param(p), - sketchCutoffs(p.sketchSize + 1, 1), refSketch(refsketch), - processMappingResults(f) + processMappingResults(f), + sketchCutoffs(p.sketchSize + 1, 1), + refIdGroup(refsketch.metadata.size()) { if (p.stage1_topANI_filter) { this->setProbs(); } + if (p.skip_prefix) + { + this->setRefGroups(); + } this->mapQuery(); } private: + // Sets the groups of reference contigs based on prefix + void setRefGroups() + { + int group = 0; + int start_idx = 0; + int idx = 0; + while (start_idx < this->refSketch.metadata.size()) + { + const auto currPrefix = prefix(this->refSketch.metadata[start_idx].name, param.prefix_delim); + idx = start_idx; + while (idx < this->refSketch.metadata.size() + && currPrefix == prefix(this->refSketch.metadata[idx].name, param.prefix_delim)) + { + this->refIdGroup[idx++] = group; + } + group++; + start_idx = idx; + } + } + + // Gets the ref group of a query based on the prefix + int getRefGroup(const std::string& seqName) + { + const auto queryPrefix = prefix(seqName, param.prefix_delim); + for (int i = 0; i < this->refSketch.metadata.size(); i++) + { + const auto currPrefix = prefix(this->refSketch.metadata[i].name, param.prefix_delim); + if (queryPrefix == currPrefix) + { + return this->refIdGroup[i]; + } + } + // Doesn't belong to any ref group + return -1; + } void setProbs() { @@ -226,7 +271,7 @@ namespace skch MappingResultsVector_t allReadMappings; //Aggregate mapping results for the complete run //Create the thread pool - ThreadPool threadPool( [this](InputSeqContainer* e){return mapModule(e);}, param.threads); + ThreadPool threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads); // kind've expensive if the fasta index is not available for the query sequences, // but it can help people know how long we're going to take @@ -291,14 +336,14 @@ namespace skch { totalReadsPickedForMapping++; //Dispatch input to thread - threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter)); + threadPool.runWhenThreadAvailable(new InputSeqProgContainer(seq, seq_name, seqCounter, progress)); //Collect output if available while ( threadPool.outputAvailable() ) { mapModuleHandleOutput(threadPool.popOutputWhenAvailable(), allReadMappings, totalReadsMapped, outstrm, progress); } } - progress.increment(seq.size()/2); + //progress.increment(seq.size()/2); seqCounter++; }); //Finish reading query input file @@ -431,13 +476,69 @@ namespace skch } } + /** + * @brief helper to main filtering function + * @details filters mappings by group + * @param[in] input unfiltered mappings + * @param[in] output filtered mappings + * @param[in] input num mappings per segment + * @return void + */ + void filterByGroup( + MappingResultsVector_t &unfilteredMappings, + MappingResultsVector_t &filteredMappings, + int n_mappings) + { + filteredMappings.reserve(unfilteredMappings.size()); + + std::sort(unfilteredMappings.begin(), unfilteredMappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); + auto subrange_begin = unfilteredMappings.begin(); + auto subrange_end = unfilteredMappings.begin(); + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) + { + while (subrange_end != unfilteredMappings.end()) + { + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[subrange_begin->refSeqId]; + subrange_end = std::find_if_not(subrange_begin, unfilteredMappings.end(), [this, currGroup] (const auto& unfilteredMappings_candidate) { + return currGroup == this->refIdGroup[unfilteredMappings_candidate.refSeqId]; + }); + } + else + { + subrange_end = unfilteredMappings.end(); + } + // TODO why are we filtering these before merging? + std::vector tmpMappings(std::distance(subrange_begin, subrange_end)); + std::move(subrange_begin, subrange_end, tmpMappings.begin()); + std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); }); + skch::Filter::query::filterMappings(tmpMappings, n_mappings); + std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings)); + subrange_begin = subrange_end; + } + } + //Sort the mappings by query (then reference) position + std::sort( + filteredMappings.begin(), filteredMappings.end(), + [](const MappingResult &a, const MappingResult &b) { + return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) + < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); + //return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos) + //< std::tie(b.refSeqId, b.refStartPos, b.queryStartPos); + }); + } + + /** * @brief main mapping function given an input read * @details this function is run in parallel by multiple threads * @param[in] input input read details * @return output object containing the mappings */ - MapModuleOutput* mapModule (InputSeqContainer* input) + MapModuleOutput* mapModule (InputSeqProgContainer* input) { MapModuleOutput* output = new MapModuleOutput(); @@ -447,13 +548,14 @@ namespace skch bool split_mapping = true; std::vector intervalPoints; // Reserve the "expected" number of interval points - intervalPoints.reserve(2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size()); - + intervalPoints.reserve( + 2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size()); std::vector l1Mappings; - MappingResultsVector_t l2Mappings; + MappingResultsVector_t unfilteredMappings; + int refGroup = this->getRefGroup(input->seqName); - if(! param.split || input->len <= param.segLength) + if(! param.split || input->len <= param.segLength || input->len <= param.block_length) { QueryMetaData Q; Q.seq = &(input->seq)[0u]; @@ -461,15 +563,18 @@ namespace skch Q.fullLen = input->len; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; //Map this sequence mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); // save the output - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); // indicate that we mapped full length split_mapping = false; + + input->progress.increment(input->len); } else //Split read mapping { @@ -485,6 +590,7 @@ namespace skch Q.fullLen = input->len; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; intervalPoints.clear(); l1Mappings.clear(); @@ -501,7 +607,8 @@ namespace skch }); // save the output - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + input->progress.increment(param.segLength); } //Map last overlapping fragment to cover the whole read @@ -513,6 +620,7 @@ namespace skch Q.len = param.segLength; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; intervalPoints.clear(); l1Mappings.clear(); @@ -528,7 +636,9 @@ namespace skch e.queryEndPos = input->len; }); - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + + input->progress.increment(input->len % param.segLength); } } @@ -537,25 +647,38 @@ namespace skch param.numMappingsForShortSequence : param.numMappingsForSegment) - 1; - if (split_mapping) { - if (param.mergeMappings) { + if (split_mapping) + { + if (param.mergeMappings) + { // hardcore merge using the chain gap - //mergeMappingsInRange(output->readMappings, param.chain_gap); - mergeMappings(output->readMappings); - if (input->len >= param.block_length) { + mergeMappingsInRange(unfilteredMappings, param.chain_gap); + //mergeMappings(unfilteredMappings); + if (input->len >= param.block_length) + { // remove short chains that didn't exceed block length - filterWeakMappings(output->readMappings, std::floor(param.block_length / param.segLength)); + filterWeakMappings(unfilteredMappings, std::floor(param.block_length / param.segLength)); } } } - // TODO why are we filtering these before merging? - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - skch::Filter::query::filterMappings(output->readMappings, n_mappings); + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t tempMappings; + tempMappings.reserve(output->readMappings.size()); + filterByGroup(unfilteredMappings, tempMappings, n_mappings); + std::swap(tempMappings, unfilteredMappings); } + std::swap(output->readMappings, unfilteredMappings); + // remove self-mode don't-maps this->filterSelfingLongToShorts(output->readMappings); + + // remove alignments where the ratio between query and target length is < our identity threshold + if (param.filterLengthMismatches) + { + this->filterFalseHighIdentity(output->readMappings); + } //Make sure mapping boundary don't exceed sequence lengths this->mappingBoundarySanityCheck(input, output->readMappings); @@ -594,7 +717,7 @@ namespace skch reportReadMappings(output->readMappings, output->qseqName, outstrm); } - progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0)); + //progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0)); delete output; } @@ -622,9 +745,36 @@ namespace skch auto t1 = skch::Time::now(); #endif - //L2 Mapping - doL2Mapping(Q, l1Mappings, l2Mappings); + auto l1_begin = l1Mappings.begin(); + auto l1_end = l1Mappings.begin(); + while (l1_end != l1Mappings.end()) + { + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[l1_begin->seqId]; + l1_end = std::find_if_not(l1_begin, l1Mappings.end(), [this, currGroup] (const auto& candidate) { + return currGroup == this->refIdGroup[candidate.seqId]; + }); + } + else + { + l1_end = l1Mappings.end(); + } + + //Sort L1 windows based on intersection size if using hg filter + if (param.stage1_topANI_filter) + { + std::make_heap(l1_begin, l1_end, L1_locus_intersection_cmp); + } + doL2Mapping(Q, l1_begin, l1_end, l2Mappings); + // Set beginning of next range + l1_begin = l1_end; + } + + // Sort output mappings + std::sort(l2Mappings.begin(), l2Mappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); #ifdef ENABLE_TIME_PROFILE_L1_L2 { @@ -653,6 +803,9 @@ namespace skch #ifdef DEBUG int orig_len = Q.minmerTableQuery.size(); #endif + const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits::max(); + Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); + // TODO remove them from the original sketch instead of removing for each read auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) { return refSketch.isFreqSeed(mi.hash); @@ -710,8 +863,12 @@ namespace skch while(!pq.empty()) { const IP_const_iterator ip_it = pq.front().it; - if (!param.skip_self || Q.seqName != this->refSketch.metadata[ip_it->seqId].name) - { + const auto& ref = this->refSketch.metadata[ip_it->seqId]; + if ((!param.skip_prefix && !param.skip_self) + || ((Q.fullLen <= ref.len) + && ((param.skip_self && Q.seqName != ref.name) + || (param.skip_prefix && this->refIdGroup[ip_it->seqId] != Q.refGroup))) + ) { intervalPoints.push_back(*ip_it); } std::pop_heap(pq.begin(), pq.end(), heap_cmp); @@ -732,8 +889,13 @@ namespace skch } - template - void computeL1CandidateRegions(Q_Info &Q, Vec1& intervalPoints, int minimumHits, Vec2 &l1Mappings) + template + void computeL1CandidateRegions( + Q_Info &Q, + IP_iter ip_begin, + IP_iter ip_end, + int minimumHits, + Vec2 &l1Mappings) { #ifdef DEBUG std::cerr << "INFO, skch::Map:computeL1CandidateRegions, read id " << Q.seqCounter << std::endl; @@ -746,8 +908,8 @@ namespace skch // Keep track of all minmer windows that intersect with [i, i+windowLen] int windowLen = std::max(0, Q.len - param.segLength); - auto trailingIt = intervalPoints.begin(); - auto leadingIt = intervalPoints.begin(); + auto trailingIt = ip_begin; + auto leadingIt = ip_begin; // Group together local sketch intersection maximums that are within clusterLen of eachother // @@ -761,11 +923,11 @@ namespace skch std::unordered_map hash_to_freq; if (param.stage1_topANI_filter) { - while (leadingIt != intervalPoints.end()) + while (leadingIt != ip_end) { // Catch the trailing iterator up to the leading iterator - windowLen while ( - trailingIt != intervalPoints.end() + trailingIt != ip_end && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen) || trailingIt->seqId < leadingIt->seqId)) { @@ -779,7 +941,7 @@ namespace skch trailingIt++; } auto currentPos = leadingIt->pos; - while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos) { + while (leadingIt != ip_end && leadingIt->pos == currentPos) { if (leadingIt->side == side::OPEN) { if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) { overlapCount++; @@ -820,8 +982,8 @@ namespace skch bool in_candidate = false; L1_candidateLocus_t l1_out = {}; - trailingIt = intervalPoints.begin(); - leadingIt = intervalPoints.begin(); + trailingIt = ip_begin; + leadingIt = ip_begin; // Keep track of 3 consecutive points so that we can track local optimums overlapCount = 0; @@ -832,7 +994,7 @@ namespace skch SeqCoord prevPos, currentPos; - while (leadingIt != intervalPoints.end()) + while (leadingIt != ip_end) { prevPrevOverlap = prevOverlap; prevOverlap = overlapCount; @@ -842,7 +1004,7 @@ namespace skch // right now, this basically happens since every CLOSE should be an OPEN. // This doesn't invalidate the logic, just potentially wastes time while ( - trailingIt != intervalPoints.end() + trailingIt != ip_end && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen) || trailingIt->seqId < leadingIt->seqId)) { @@ -859,7 +1021,7 @@ namespace skch prevPos = currentPos; currentPos = SeqCoord{leadingIt->seqId, leadingIt->pos}; } - while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos.pos) + while (leadingIt != ip_end && leadingIt->pos == currentPos.pos) { if (leadingIt->side == side::OPEN) { if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) { @@ -953,12 +1115,26 @@ namespace skch //3. Compute L1 windows int minimumHits = Stat::estimateMinimumHitsRelaxed(Q.sketchSize, param.kmerSize, param.percentageIdentity, skch::fixed::confidence_interval); - computeL1CandidateRegions(Q, intervalPoints, minimumHits, l1Mappings); - //4. Sort L1 windows based on intersection size if using hg filter - if (param.stage1_topANI_filter) + // For each "group" + auto ip_begin = intervalPoints.begin(); + auto ip_end = intervalPoints.begin(); + while (ip_end != intervalPoints.end()) { - std::make_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp); + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[ip_begin->seqId]; + ip_end = std::find_if_not(ip_begin, intervalPoints.end(), [this, currGroup] (const auto& ip) { + return currGroup == this->refIdGroup[ip.seqId]; + }); + } + else + { + ip_end = intervalPoints.end(); + } + computeL1CandidateRegions(Q, ip_begin, ip_end, minimumHits, l1Mappings); + + ip_begin = ip_end; } } @@ -975,20 +1151,14 @@ namespace skch * @param[in] l1Mappings candidate regions for query sequence found at L1 * @param[out] l2Mappings Mapping results in the L2 stage */ - template - void doL2Mapping(Q_Info &Q, VecIn &l1Mappings, VecOut &l2Mappings) + template + void doL2Mapping(Q_Info &Q, L1_Iter l1_begin, L1_Iter l1_end, VecOut &l2Mappings) { ///2. Walk the read over the candidate regions and compute the jaccard similarity with minimum s sketches std::vector l2_vec; - if (!param.stage1_topANI_filter) - { - // Only reserve if we know that we'll be storing each of the l1 mappings - l2Mappings.reserve(l1Mappings.size()); - } - double bestJaccardNumerator = 0; - auto loc_iterator = l1Mappings.begin(); - while (loc_iterator != l1Mappings.end()) + auto loc_iterator = l1_begin; + while (loc_iterator != l1_end) { L1_candidateLocus_t& candidateLocus = *loc_iterator; @@ -996,9 +1166,9 @@ namespace skch { // If using HG filter, don't consider any mappings which have no chance of being // within param.ANIDiff of the best mapping seen so far - double cutoff_ani = (1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff; + double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff)); double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize); - if (float(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) + if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) { break; } @@ -1021,12 +1191,9 @@ namespace skch // if we are in all-vs-all mode, it isn't a self-mapping, // and if we are self-mapping, the query is shorter than the target const auto& ref = this->refSketch.metadata[l2.seqId]; - if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity - || nucIdentity >= param.percentageIdentity) - && !(param.skip_self && Q.seqName == ref.name) - && !(param.skip_prefix - && prefix(Q.seqName, param.prefix_delim) - == prefix(ref.name, param.prefix_delim))) + if((Q.kmerComplexity >= param.kmerComplexityThreshold) + && ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) + || nucIdentity >= param.percentageIdentity)) { //Track the best jaccard numerator bestJaccardNumerator = std::max(bestJaccardNumerator, l2.sharedSketchSize); @@ -1049,6 +1216,7 @@ namespace skch res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos); res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0); res.strand = l2.strand; + res.kmerComplexity = Q.kmerComplexity; res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len); @@ -1059,23 +1227,17 @@ namespace skch if (param.stage1_topANI_filter) { - std::pop_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp); - l1Mappings.pop_back(); + std::pop_heap(l1_begin, l1_end, L1_locus_intersection_cmp); + l1_end--; //"Pop back" } else { loc_iterator++; } } - std::sort(l2Mappings.begin(), l2Mappings.end(), [](auto& a, auto& b) - { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); //std::cerr << "For an segment with " << l1Mappings.size() //<< " L1 mappings " //<< " there were " << l2Mappings.size() << " L2 mappings\n"; - -#ifdef DEBUG - std::cerr << "[mashmap::skch::Map:doL2Mapping] read id " << Q.seqCounter << ", count of L1 candidates= " << l1Mappings.size() << ", count of L2 candidates= " << l2Mappings.size() << std::endl; -#endif } /** @@ -1366,6 +1528,10 @@ namespace skch it->nucIdentity = ( std::accumulate(it, it_end, 0.0, [](double x, MappingResult &e){ return x + e.nucIdentity; }) )/ std::distance(it, it_end); + //Mean sequence complexity of all mappings in the chain + it->kmerComplexity = ( std::accumulate(it, it_end, 0.0, + [](double x, MappingResult &e){ return x + e.kmerComplexity; }) )/ std::distance(it, it_end); + //Discard other mappings of this chain std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; }); @@ -1489,13 +1655,7 @@ namespace skch it->nucIdentity = ( std::accumulate( it, it_end, 0.0, [](double x, MappingResult &e){ return x + e.nucIdentity; }) - ) /// it->n_merged; // this would scale directly by the number of mappings in the chain - // this scales slightly by the amount of missing segments - / ( (double)it->n_merged - //+ std::pow( - //std::log((double)it->blockLength / param.segLength), - //0.01) - ); + ) / it->n_merged; // this would scale directly by the number of mappings in the chain //Discard other mappings of this chain @@ -1519,7 +1679,7 @@ namespace skch * @param[in/out] readMappings Mappings computed by Mashmap (L2 stage) for a read */ template - void mappingBoundarySanityCheck(InputSeqContainer* input, VecIn &readMappings) + void mappingBoundarySanityCheck(InputSeqProgContainer* input, VecIn &readMappings) { for(auto &e : readMappings) { @@ -1589,7 +1749,8 @@ namespace skch outstrm << sep << e.conservedSketches << sep << e.blockLength << sep << fakeMapQ - << sep << "id:f:" << e.nucIdentity; + << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity + << sep << "kc:f:" << e.kmerComplexity; if (!param.mergeMappings) { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 22c4512..bdba569 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -58,6 +58,10 @@ struct Parameters char prefix_delim; //the prefix delimiter bool mergeMappings; //if we should merge consecutive segment mappings bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity + bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash + bool filterLengthMismatches; //true if filtering out length mismatches + float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) + int sketchSize; bool use_spaced_seeds; // @@ -89,7 +93,7 @@ float confidence_interval = 0.95; // Confidence interval to re float percentage_identity = 0.85; // Percent identity in the mapping step float ANIDiff = 0.0; // Stage 1 ANI diff threshold float ANIDiffConf = 0.999; // ANI diff confidence -std::string VERSION = "3.0.5"; // Version of MashMap +std::string VERSION = "3.0.6"; // Version of MashMap } } diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 1ad5d4d..23904bc 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOptionAlternative("kmer","k"); cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue); + cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue); //cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction."); @@ -103,6 +104,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("hgFilterAniDiff", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", ArgvParser::OptionRequiresValue); cmd.defineOption("hgFilterConf", "Confidence value for the hypergeometric filtering [default: 0.999]", ArgvParser::OptionRequiresValue); + cmd.defineOption("filterLengthMismatches", "Filter mappings where the ratio of reference/query mapped lengths disagrees with the ANI threshold"); + cmd.defineOption("skipSelf", "skip self mappings when the query and target name is the same (for all-vs-all mode)"); cmd.defineOptionAlternative("skipSelf", "X"); @@ -123,6 +126,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOptionAlternative("filter_mode", "f"); cmd.defineOption("legacy", "MashMap2 output format"); + cmd.defineOption("reportPercentage", "Report predicted ANI values in [0, 100] instead of [0,1] (necessary for use with wfmash)"); } /** @@ -345,6 +349,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir //Do not expose the option to set protein alphabet in mashmap //parameters.alphabetSize = 20; + parameters.filterLengthMismatches = cmd.foundOption("filterLengthMismatches"); parameters.stage1_topANI_filter = !cmd.foundOption("noHgFilter"); if(cmd.foundOption("filter_mode")) @@ -506,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.percentageIdentity = 0.85; str.clear(); + if(cmd.foundOption("kmerComplexity")) + { + str << cmd.optionValue("kmerComplexity"); + str >> parameters.kmerComplexityThreshold; + str.clear(); + } + else + parameters.kmerComplexityThreshold = 0.0; + str.clear(); + if (cmd.foundOption("hgFilterAniDiff")) { str << cmd.optionValue("hgFilterAniDiff"); @@ -596,6 +611,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.outFileName = "mashmap.out"; parameters.legacy_output = cmd.foundOption("legacy"); + parameters.report_ANI_percentage = cmd.foundOption("reportPercentage"); printCmdOptions(parameters);