From 671fc42945581002ae431e9e7cabbcc4f1087068 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Thu, 29 Jun 2023 15:13:12 -0500 Subject: [PATCH 01/15] Add option to report ANI as percentage in [0, 100] --- src/map/include/computeMap.hpp | 2 +- src/map/include/map_parameters.hpp | 2 ++ src/map/include/parseCmdArgs.hpp | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 38445db..75a71f5 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1589,7 +1589,7 @@ namespace skch outstrm << sep << e.conservedSketches << sep << e.blockLength << sep << fakeMapQ - << sep << "id:f:" << e.nucIdentity; + << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity; if (!param.mergeMappings) { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 22c4512..cda710f 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -58,6 +58,8 @@ struct Parameters char prefix_delim; //the prefix delimiter bool mergeMappings; //if we should merge consecutive segment mappings bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity + bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash + int sketchSize; bool use_spaced_seeds; // diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 1ad5d4d..74e16c4 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -123,6 +123,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOptionAlternative("filter_mode", "f"); cmd.defineOption("legacy", "MashMap2 output format"); + cmd.defineOption("reportPercentage", "Report predicted ANI values in [0, 100] instead of [0,1] (necessary for use with wfmash)"); } /** @@ -596,6 +597,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.outFileName = "mashmap.out"; parameters.legacy_output = cmd.foundOption("legacy"); + parameters.report_ANI_percentage = cmd.foundOption("reportPercentage"); printCmdOptions(parameters); From d586a037df366229d3a3838053b4984fc9dd237c Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 4 Jul 2023 17:38:02 -0500 Subject: [PATCH 02/15] add missing include --- src/common/progress.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/progress.hpp b/src/common/progress.hpp index e63e531..fd9d5b9 100644 --- a/src/common/progress.hpp +++ b/src/common/progress.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include From 38f3e386cf10351a3d9d06fb450293b381e80f6c Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 4 Jul 2023 17:38:34 -0500 Subject: [PATCH 03/15] Add address sanitizer for debug --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 561a375..ab37515 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,8 @@ endif () if (${CMAKE_BUILD_TYPE} MATCHES Debug) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -g") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O -g") + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) else() # Use all standard-compliant optimizations - always add these: set (CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${PIC_FLAG} ${EXTRA_FLAGS}") From b9bf235e936cf376339fb04e17a62d288095bb93 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 4 Jul 2023 17:38:44 -0500 Subject: [PATCH 04/15] Working split-prefix filter --- src/map/include/base_types.hpp | 41 ++++- src/map/include/computeMap.hpp | 313 +++++++++++++++++++++++++-------- 2 files changed, 278 insertions(+), 76 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index ff5d5f0..d346f14 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -10,6 +10,7 @@ #include #include #include +#include "common/progress.hpp" namespace skch { @@ -204,10 +205,11 @@ namespace skch //Container to save copy of kseq object struct InputSeqContainer { - seqno_t seqCounter; //sequence counter - offset_t len; //sequence length - std::string seq; //sequence string - std::string seqName; //sequence id + seqno_t seqCounter; //sequence counter + offset_t len; //sequence length + std::string seq; //sequence string + std::string seqName; //sequence id + /* * @brief constructor @@ -216,12 +218,36 @@ namespace skch * @param[in] len length of sequence */ InputSeqContainer(const std::string& s, const std::string& id, seqno_t seqcount) - : seq(s) - , seqName(id) + : seqCounter(seqcount) , len(s.length()) - , seqCounter(seqcount) { } + , seq(s) + , seqName(id) { } }; + struct InputSeqProgContainer + { + seqno_t seqCounter; //sequence counter + offset_t len; //sequence length + std::string seq; //sequence string + std::string seqName; //sequence id + progress_meter::ProgressMeter& progress; //progress meter (shared) + + + /* + * @brief constructor + * @param[in] kseq_seq complete read or reference sequence + * @param[in] kseq_id sequence id name + * @param[in] len length of sequence + */ + InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm) + : seqCounter(seqcount) + , len(s.length()) + , seq(s) + , seqName(id) + , progress(pm) { } + }; + + //Output type of map function struct MapModuleOutput { @@ -248,6 +274,7 @@ namespace skch std::string seqName; //sequence name MinmerVec minmerTableQuery; //Vector of minmers in the query MinmerVec seedHits; //Vector of minmers in the reference + int refGroup; //Prefix group of sequence }; } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 75a71f5..ad8ecb0 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -106,6 +106,10 @@ namespace skch //for an L1 candidate if the best intersection size is i; std::vector sketchCutoffs; + //Vector for obtaining group from refId + //if refIdGroup[i] == refIdGroup[j], then sequence i and j have the same prefix; + std::vector refIdGroup; + public: /** @@ -117,18 +121,58 @@ namespace skch Map(const skch::Parameters &p, const skch::Sketch &refsketch, PostProcessResultsFn_t f = nullptr) : param(p), - sketchCutoffs(p.sketchSize + 1, 1), refSketch(refsketch), - processMappingResults(f) + processMappingResults(f), + sketchCutoffs(p.sketchSize + 1, 1), + refIdGroup(refsketch.metadata.size()) { if (p.stage1_topANI_filter) { this->setProbs(); } + if (p.skip_prefix) + { + this->setRefGroups(); + } this->mapQuery(); } private: + // Sets the groups of reference contigs based on prefix + void setRefGroups() + { + int group = 0; + int start_idx = 0; + int idx = 0; + while (start_idx < this->refSketch.metadata.size()) + { + const auto currPrefix = prefix(this->refSketch.metadata[start_idx].name, param.prefix_delim); + idx = start_idx; + while (idx < this->refSketch.metadata.size() + && currPrefix == prefix(this->refSketch.metadata[idx].name, param.prefix_delim)) + { + this->refIdGroup[idx++] = group; + } + group++; + start_idx = idx; + } + } + + // Gets the ref group of a query based on the prefix + int getRefGroup(const std::string& seqName) + { + const auto queryPrefix = prefix(seqName, param.prefix_delim); + for (int i = 0; i < this->refSketch.metadata.size(); i++) + { + const auto currPrefix = prefix(this->refSketch.metadata[i].name, param.prefix_delim); + if (queryPrefix == currPrefix) + { + return this->refIdGroup[i]; + } + } + // Doesn't belong to any ref group + return -1; + } void setProbs() { @@ -226,7 +270,7 @@ namespace skch MappingResultsVector_t allReadMappings; //Aggregate mapping results for the complete run //Create the thread pool - ThreadPool threadPool( [this](InputSeqContainer* e){return mapModule(e);}, param.threads); + ThreadPool threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads); // kind've expensive if the fasta index is not available for the query sequences, // but it can help people know how long we're going to take @@ -291,14 +335,14 @@ namespace skch { totalReadsPickedForMapping++; //Dispatch input to thread - threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter)); + threadPool.runWhenThreadAvailable(new InputSeqProgContainer(seq, seq_name, seqCounter, progress)); //Collect output if available while ( threadPool.outputAvailable() ) { mapModuleHandleOutput(threadPool.popOutputWhenAvailable(), allReadMappings, totalReadsMapped, outstrm, progress); } } - progress.increment(seq.size()/2); + //progress.increment(seq.size()/2); seqCounter++; }); //Finish reading query input file @@ -431,13 +475,69 @@ namespace skch } } + /** + * @brief helper to main filtering function + * @details filters mappings by group + * @param[in] input unfiltered mappings + * @param[in] output filtered mappings + * @param[in] input num mappings per segment + * @return void + */ + void filterByGroup( + MappingResultsVector_t &unfilteredMappings, + MappingResultsVector_t &filteredMappings, + int n_mappings) + { + filteredMappings.reserve(unfilteredMappings.size()); + + std::sort(unfilteredMappings.begin(), unfilteredMappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); + auto subrange_begin = unfilteredMappings.begin(); + auto subrange_end = unfilteredMappings.begin(); + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) + { + while (subrange_end != unfilteredMappings.end()) + { + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[subrange_begin->refSeqId]; + subrange_end = std::find_if_not(subrange_begin, unfilteredMappings.end(), [this, currGroup] (const auto& unfilteredMappings_candidate) { + return currGroup == this->refIdGroup[unfilteredMappings_candidate.refSeqId]; + }); + } + else + { + subrange_end = unfilteredMappings.end(); + } + // TODO why are we filtering these before merging? + std::vector tmpMappings(std::distance(subrange_begin, subrange_end)); + std::move(subrange_begin, subrange_end, tmpMappings.begin()); + std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); }); + skch::Filter::query::filterMappings(tmpMappings, n_mappings); + std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings)); + subrange_begin = subrange_end; + } + } + //Sort the mappings by query (then reference) position + std::sort( + filteredMappings.begin(), filteredMappings.end(), + [](const MappingResult &a, const MappingResult &b) { + return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) + < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); + //return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos) + //< std::tie(b.refSeqId, b.refStartPos, b.queryStartPos); + }); + } + + /** * @brief main mapping function given an input read * @details this function is run in parallel by multiple threads * @param[in] input input read details * @return output object containing the mappings */ - MapModuleOutput* mapModule (InputSeqContainer* input) + MapModuleOutput* mapModule (InputSeqProgContainer* input) { MapModuleOutput* output = new MapModuleOutput(); @@ -447,11 +547,12 @@ namespace skch bool split_mapping = true; std::vector intervalPoints; // Reserve the "expected" number of interval points - intervalPoints.reserve(2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size()); - + intervalPoints.reserve( + 2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size()); std::vector l1Mappings; - MappingResultsVector_t l2Mappings; + MappingResultsVector_t unfilteredMappings; + int refGroup = this->getRefGroup(input->seqName); if(! param.split || input->len <= param.segLength) { @@ -461,15 +562,18 @@ namespace skch Q.fullLen = input->len; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; //Map this sequence mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings); // save the output - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); // indicate that we mapped full length split_mapping = false; + + input->progress.increment(input->len); } else //Split read mapping { @@ -485,6 +589,7 @@ namespace skch Q.fullLen = input->len; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; intervalPoints.clear(); l1Mappings.clear(); @@ -501,7 +606,8 @@ namespace skch }); // save the output - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + input->progress.increment(param.segLength); } //Map last overlapping fragment to cover the whole read @@ -513,6 +619,7 @@ namespace skch Q.len = param.segLength; Q.seqCounter = input->seqCounter; Q.seqName = input->seqName; + Q.refGroup = refGroup; intervalPoints.clear(); l1Mappings.clear(); @@ -528,7 +635,9 @@ namespace skch e.queryEndPos = input->len; }); - output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end()); + unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end()); + + input->progress.increment(input->len % param.segLength); } } @@ -537,25 +646,44 @@ namespace skch param.numMappingsForShortSequence : param.numMappingsForSegment) - 1; - if (split_mapping) { - if (param.mergeMappings) { + // TODO this should happen after chaining i think + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + if (!param.stage1_topANI_filter) { + filterByGroup(unfilteredMappings, output->readMappings, n_mappings*100); + std::swap(unfilteredMappings, output->readMappings); + output->readMappings.clear(); + } + } + + if (split_mapping) + { + if (param.mergeMappings) + { // hardcore merge using the chain gap - //mergeMappingsInRange(output->readMappings, param.chain_gap); - mergeMappings(output->readMappings); - if (input->len >= param.block_length) { + mergeMappingsInRange(unfilteredMappings, param.chain_gap); + //mergeMappings(unfilteredMappings); + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t tempMappings; + tempMappings.reserve(output->readMappings.size()); + filterByGroup(unfilteredMappings, tempMappings, n_mappings); + std::swap(tempMappings, unfilteredMappings); + } + if (input->len >= param.block_length) + { // remove short chains that didn't exceed block length - filterWeakMappings(output->readMappings, std::floor(param.block_length / param.segLength)); + filterWeakMappings(unfilteredMappings, std::floor(param.block_length / param.segLength)); } } } - // TODO why are we filtering these before merging? - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - skch::Filter::query::filterMappings(output->readMappings, n_mappings); - } + std::swap(output->readMappings, unfilteredMappings); // remove self-mode don't-maps this->filterSelfingLongToShorts(output->readMappings); + + // Currently causing issues, disabling for now + // remove alignments where the ratio between query and target length is < our identity threshold + //this->filterFalseHighIdentity(output->readMappings); //Make sure mapping boundary don't exceed sequence lengths this->mappingBoundarySanityCheck(input, output->readMappings); @@ -594,7 +722,7 @@ namespace skch reportReadMappings(output->readMappings, output->qseqName, outstrm); } - progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0)); + //progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0)); delete output; } @@ -622,9 +750,36 @@ namespace skch auto t1 = skch::Time::now(); #endif - //L2 Mapping - doL2Mapping(Q, l1Mappings, l2Mappings); + auto l1_begin = l1Mappings.begin(); + auto l1_end = l1Mappings.begin(); + while (l1_end != l1Mappings.end()) + { + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[l1_begin->seqId]; + l1_end = std::find_if_not(l1_begin, l1Mappings.end(), [this, currGroup] (const auto& candidate) { + return currGroup == this->refIdGroup[candidate.seqId]; + }); + } + else + { + l1_end = l1Mappings.end(); + } + //Sort L1 windows based on intersection size if using hg filter + if (param.stage1_topANI_filter) + { + std::make_heap(l1_begin, l1_end, L1_locus_intersection_cmp); + } + doL2Mapping(Q, l1_begin, l1_end, l2Mappings); + + // Set beginning of next range + l1_begin = l1_end; + } + + // Sort output mappings + std::sort(l2Mappings.begin(), l2Mappings.end(), [](const auto& a, const auto& b) + { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); #ifdef ENABLE_TIME_PROFILE_L1_L2 { @@ -710,8 +865,12 @@ namespace skch while(!pq.empty()) { const IP_const_iterator ip_it = pq.front().it; - if (!param.skip_self || Q.seqName != this->refSketch.metadata[ip_it->seqId].name) - { + const auto& ref = this->refSketch.metadata[ip_it->seqId]; + if ((!param.skip_prefix && !param.skip_self) + || ((Q.fullLen <= ref.len) + && ((param.skip_self && Q.seqName != ref.name) + || (param.skip_prefix && this->refIdGroup[ip_it->seqId] != Q.refGroup))) + ) { intervalPoints.push_back(*ip_it); } std::pop_heap(pq.begin(), pq.end(), heap_cmp); @@ -732,8 +891,13 @@ namespace skch } - template - void computeL1CandidateRegions(Q_Info &Q, Vec1& intervalPoints, int minimumHits, Vec2 &l1Mappings) + template + void computeL1CandidateRegions( + Q_Info &Q, + IP_iter ip_begin, + IP_iter ip_end, + int minimumHits, + Vec2 &l1Mappings) { #ifdef DEBUG std::cerr << "INFO, skch::Map:computeL1CandidateRegions, read id " << Q.seqCounter << std::endl; @@ -746,8 +910,8 @@ namespace skch // Keep track of all minmer windows that intersect with [i, i+windowLen] int windowLen = std::max(0, Q.len - param.segLength); - auto trailingIt = intervalPoints.begin(); - auto leadingIt = intervalPoints.begin(); + auto trailingIt = ip_begin; + auto leadingIt = ip_begin; // Group together local sketch intersection maximums that are within clusterLen of eachother // @@ -761,11 +925,11 @@ namespace skch std::unordered_map hash_to_freq; if (param.stage1_topANI_filter) { - while (leadingIt != intervalPoints.end()) + while (leadingIt != ip_end) { // Catch the trailing iterator up to the leading iterator - windowLen while ( - trailingIt != intervalPoints.end() + trailingIt != ip_end && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen) || trailingIt->seqId < leadingIt->seqId)) { @@ -779,7 +943,7 @@ namespace skch trailingIt++; } auto currentPos = leadingIt->pos; - while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos) { + while (leadingIt != ip_end && leadingIt->pos == currentPos) { if (leadingIt->side == side::OPEN) { if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) { overlapCount++; @@ -820,8 +984,8 @@ namespace skch bool in_candidate = false; L1_candidateLocus_t l1_out = {}; - trailingIt = intervalPoints.begin(); - leadingIt = intervalPoints.begin(); + trailingIt = ip_begin; + leadingIt = ip_begin; // Keep track of 3 consecutive points so that we can track local optimums overlapCount = 0; @@ -832,7 +996,7 @@ namespace skch SeqCoord prevPos, currentPos; - while (leadingIt != intervalPoints.end()) + while (leadingIt != ip_end) { prevPrevOverlap = prevOverlap; prevOverlap = overlapCount; @@ -842,7 +1006,7 @@ namespace skch // right now, this basically happens since every CLOSE should be an OPEN. // This doesn't invalidate the logic, just potentially wastes time while ( - trailingIt != intervalPoints.end() + trailingIt != ip_end && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen) || trailingIt->seqId < leadingIt->seqId)) { @@ -859,7 +1023,7 @@ namespace skch prevPos = currentPos; currentPos = SeqCoord{leadingIt->seqId, leadingIt->pos}; } - while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos.pos) + while (leadingIt != ip_end && leadingIt->pos == currentPos.pos) { if (leadingIt->side == side::OPEN) { if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) { @@ -953,12 +1117,26 @@ namespace skch //3. Compute L1 windows int minimumHits = Stat::estimateMinimumHitsRelaxed(Q.sketchSize, param.kmerSize, param.percentageIdentity, skch::fixed::confidence_interval); - computeL1CandidateRegions(Q, intervalPoints, minimumHits, l1Mappings); - //4. Sort L1 windows based on intersection size if using hg filter - if (param.stage1_topANI_filter) + // For each "group" + auto ip_begin = intervalPoints.begin(); + auto ip_end = intervalPoints.begin(); + while (ip_end != intervalPoints.end()) { - std::make_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp); + if (param.skip_prefix) + { + int currGroup = this->refIdGroup[ip_begin->seqId]; + ip_end = std::find_if_not(ip_begin, intervalPoints.end(), [this, currGroup] (const auto& ip) { + return currGroup == this->refIdGroup[ip.seqId]; + }); + } + else + { + ip_end = intervalPoints.end(); + } + computeL1CandidateRegions(Q, ip_begin, ip_end, minimumHits, l1Mappings); + + ip_begin = ip_end; } } @@ -975,20 +1153,14 @@ namespace skch * @param[in] l1Mappings candidate regions for query sequence found at L1 * @param[out] l2Mappings Mapping results in the L2 stage */ - template - void doL2Mapping(Q_Info &Q, VecIn &l1Mappings, VecOut &l2Mappings) + template + void doL2Mapping(Q_Info &Q, L1_Iter l1_begin, L1_Iter l1_end, VecOut &l2Mappings) { ///2. Walk the read over the candidate regions and compute the jaccard similarity with minimum s sketches std::vector l2_vec; - if (!param.stage1_topANI_filter) - { - // Only reserve if we know that we'll be storing each of the l1 mappings - l2Mappings.reserve(l1Mappings.size()); - } - double bestJaccardNumerator = 0; - auto loc_iterator = l1Mappings.begin(); - while (loc_iterator != l1Mappings.end()) + auto loc_iterator = l1_begin; + while (loc_iterator != l1_end) { L1_candidateLocus_t& candidateLocus = *loc_iterator; @@ -996,9 +1168,9 @@ namespace skch { // If using HG filter, don't consider any mappings which have no chance of being // within param.ANIDiff of the best mapping seen so far - double cutoff_ani = (1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff; + double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff)); double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize); - if (float(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) + if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) { break; } @@ -1007,6 +1179,19 @@ namespace skch l2_vec.clear(); computeL2MappedRegions(Q, candidateLocus, l2_vec); + //if (l2_vec.size() > 1) + //{ + //std::cerr << candidateLocus.seqId << ":" + //<< candidateLocus.rangeStartPos << "--" << candidateLocus.rangeEndPos << " == " << candidateLocus.rangeEndPos - candidateLocus.rangeStartPos + //<< std::endl; + //std::cerr << candidateLocus.intersectionSize + //<< "-->" << sketchCutoffs[candidateLocus.intersectionSize] << std::endl; + //for (const auto& l2 : l2_vec) + //{ + //std::cerr << l2.optimalStart << "--" << l2.optimalEnd << "\t" << l2.sharedSketchSize << std::endl; + //} + //std::cerr << std::endl; + //} for (auto& l2 : l2_vec) { @@ -1021,12 +1206,8 @@ namespace skch // if we are in all-vs-all mode, it isn't a self-mapping, // and if we are self-mapping, the query is shorter than the target const auto& ref = this->refSketch.metadata[l2.seqId]; - if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity + if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) || nucIdentity >= param.percentageIdentity) - && !(param.skip_self && Q.seqName == ref.name) - && !(param.skip_prefix - && prefix(Q.seqName, param.prefix_delim) - == prefix(ref.name, param.prefix_delim))) { //Track the best jaccard numerator bestJaccardNumerator = std::max(bestJaccardNumerator, l2.sharedSketchSize); @@ -1059,23 +1240,17 @@ namespace skch if (param.stage1_topANI_filter) { - std::pop_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp); - l1Mappings.pop_back(); + std::pop_heap(l1_begin, l1_end, L1_locus_intersection_cmp); + l1_end--; //"Pop back" } else { loc_iterator++; } } - std::sort(l2Mappings.begin(), l2Mappings.end(), [](auto& a, auto& b) - { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); }); //std::cerr << "For an segment with " << l1Mappings.size() //<< " L1 mappings " //<< " there were " << l2Mappings.size() << " L2 mappings\n"; - -#ifdef DEBUG - std::cerr << "[mashmap::skch::Map:doL2Mapping] read id " << Q.seqCounter << ", count of L1 candidates= " << l1Mappings.size() << ", count of L2 candidates= " << l2Mappings.size() << std::endl; -#endif } /** @@ -1519,7 +1694,7 @@ namespace skch * @param[in/out] readMappings Mappings computed by Mashmap (L2 stage) for a read */ template - void mappingBoundarySanityCheck(InputSeqContainer* input, VecIn &readMappings) + void mappingBoundarySanityCheck(InputSeqProgContainer* input, VecIn &readMappings) { for(auto &e : readMappings) { From efce4ecbb1267bf62096bebc4421f7a7a75b9749 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 14:36:42 -0500 Subject: [PATCH 05/15] Fix missing include found in #54 --- src/common/utils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/utils.hpp b/src/common/utils.hpp index f5646dc..119be9d 100644 --- a/src/common/utils.hpp +++ b/src/common/utils.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include From 3045335bc8bd67dd28629fa4363c93971c724e19 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 14:39:45 -0500 Subject: [PATCH 06/15] Update version number --- src/map/include/map_parameters.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index cda710f..f8ffbe7 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -91,7 +91,7 @@ float confidence_interval = 0.95; // Confidence interval to re float percentage_identity = 0.85; // Percent identity in the mapping step float ANIDiff = 0.0; // Stage 1 ANI diff threshold float ANIDiffConf = 0.999; // ANI diff confidence -std::string VERSION = "3.0.5"; // Version of MashMap +std::string VERSION = "3.0.6"; // Version of MashMap } } From f7c4856ed132b187c8a67d2fe85d210e8400ff25 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 15:25:13 -0500 Subject: [PATCH 07/15] Ignore k-mers w/ ambiguous nucleotides --- src/map/include/commonFunc.hpp | 39 ++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp index 76946ff..a4401bb 100644 --- a/src/map/include/commonFunc.hpp +++ b/src/map/include/commonFunc.hpp @@ -16,6 +16,7 @@ #include #include #include +#include //Own includes #include "map/include/map_parameters.hpp" @@ -186,9 +187,26 @@ namespace skch { ankerl::unordered_dense::map sketched_vals; std::vector sketched_heap; sketched_heap.reserve(sketchSize+1); + + // Get distance until last "N" + std::string_view first_kmer(seq, kmerSize); + int ambig_kmer_count = 0; + for (int i = kmerSize - 1; i >= 0; i--) + { + if (seq[i] == 'N') + { + ambig_kmer_count = i+1; + break; + } + } for(offset_t i = 0; i < len - kmerSize + 1; i++) { + + if (seq[i+kmerSize-1] == 'N') + { + ambig_kmer_count = kmerSize; + } //Hash kmers hash_t hashFwd = CommonFunc::getHash(seq + i, kmerSize); hash_t hashBwd; @@ -199,7 +217,7 @@ namespace skch { hashBwd = std::numeric_limits::max(); //Pick a dummy high value so that it is ignored later //Consider non-symmetric kmers only - if(hashBwd != hashFwd) + if(hashBwd != hashFwd && ambig_kmer_count == 0) { //Take minimum value of kmer and its reverse complement hash_t currentKmer = std::min(hashFwd, hashBwd); @@ -237,6 +255,10 @@ namespace skch { } } } + if (ambig_kmer_count > 0) + { + ambig_kmer_count--; + } } minmerIndex.resize(sketched_heap.size()); @@ -293,6 +315,10 @@ namespace skch { //if(alphabetSize == 4) //not protein //CommonFunc::reverseComplement(seq, seqRev.get(), len); + + // Get distance until last "N" + int ambig_kmer_count = 0; + for(offset_t i = 0; i < len - kmerSize + 1; i++) { @@ -369,8 +395,12 @@ namespace skch { Q.pop_front(); } + if (seq[i+kmerSize-1] == 'N') + { + ambig_kmer_count = kmerSize; + } //Consider non-symmetric kmers only - if(hashBwd != hashFwd) + if(hashBwd != hashFwd && ambig_kmer_count == 0) { // Add current hash to window Q.push_back(std::make_tuple(currentKmer, currentStrand, i)); @@ -399,6 +429,11 @@ namespace skch { std::push_heap(heapWindow.begin(), heapWindow.end(), KIHeap_cmp); } } + if (ambig_kmer_count > 0) + { + ambig_kmer_count--; + } + From f17cf64ab7e6b409aedd57a641ce8ae2d653cc57 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 15:26:10 -0500 Subject: [PATCH 08/15] Do not split sequences smaller than the block length --- src/map/include/computeMap.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index ad8ecb0..8396096 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -554,7 +554,7 @@ namespace skch MappingResultsVector_t unfilteredMappings; int refGroup = this->getRefGroup(input->seqName); - if(! param.split || input->len <= param.segLength) + if(! param.split || input->len <= param.segLength || input->len <= param.block_length) { QueryMetaData Q; Q.seq = &(input->seq)[0u]; From 7bc4d401ececcc994ae26f46e7103f52c1b5fbf5 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 15:26:44 -0500 Subject: [PATCH 09/15] Only filter once and after chaining --- src/map/include/computeMap.hpp | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 8396096..d872ce9 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -646,15 +646,6 @@ namespace skch param.numMappingsForShortSequence : param.numMappingsForSegment) - 1; - // TODO this should happen after chaining i think - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - if (!param.stage1_topANI_filter) { - filterByGroup(unfilteredMappings, output->readMappings, n_mappings*100); - std::swap(unfilteredMappings, output->readMappings); - output->readMappings.clear(); - } - } - if (split_mapping) { if (param.mergeMappings) @@ -662,12 +653,6 @@ namespace skch // hardcore merge using the chain gap mergeMappingsInRange(unfilteredMappings, param.chain_gap); //mergeMappings(unfilteredMappings); - if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { - MappingResultsVector_t tempMappings; - tempMappings.reserve(output->readMappings.size()); - filterByGroup(unfilteredMappings, tempMappings, n_mappings); - std::swap(tempMappings, unfilteredMappings); - } if (input->len >= param.block_length) { // remove short chains that didn't exceed block length @@ -676,6 +661,13 @@ namespace skch } } + if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) { + MappingResultsVector_t tempMappings; + tempMappings.reserve(output->readMappings.size()); + filterByGroup(unfilteredMappings, tempMappings, n_mappings); + std::swap(tempMappings, unfilteredMappings); + } + std::swap(output->readMappings, unfilteredMappings); // remove self-mode don't-maps From e9b82f6f8281290e61468e54385592ccd21ad128 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 15:28:20 -0500 Subject: [PATCH 10/15] Add predicted sequence complexity tag (sc:f:x.xx) --- src/map/include/base_types.hpp | 2 ++ src/map/include/computeMap.hpp | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index d346f14..741ef1c 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -166,6 +166,7 @@ namespace skch //--for split read mapping + long double seqComplexity; // Estimated sequence complexity int n_merged; // how many mappings we've merged into this one offset_t splitMappingId; // To identify split mappings that are chained uint8_t discard; // set to 1 for deletion @@ -275,6 +276,7 @@ namespace skch MinmerVec minmerTableQuery; //Vector of minmers in the query MinmerVec seedHits; //Vector of minmers in the reference int refGroup; //Prefix group of sequence + float seqComplexity; //Estimated sequence complexity }; } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index d872ce9..d721d80 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -7,6 +7,7 @@ #ifndef SKETCH_MAP_HPP #define SKETCH_MAP_HPP +#include #include #include #include @@ -800,6 +801,9 @@ namespace skch #ifdef DEBUG int orig_len = Q.minmerTableQuery.size(); #endif + const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits::max(); + Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); + // TODO remove them from the original sketch instead of removing for each read auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) { return refSketch.isFreqSeed(mi.hash); @@ -1222,6 +1226,7 @@ namespace skch res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos); res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0); res.strand = l2.strand; + res.seqComplexity = Q.seqComplexity; res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len); @@ -1533,6 +1538,10 @@ namespace skch it->nucIdentity = ( std::accumulate(it, it_end, 0.0, [](double x, MappingResult &e){ return x + e.nucIdentity; }) )/ std::distance(it, it_end); + //Mean sequence complexity of all mappings in the chain + it->seqComplexity = ( std::accumulate(it, it_end, 0.0, + [](double x, MappingResult &e){ return x + e.seqComplexity; }) )/ std::distance(it, it_end); + //Discard other mappings of this chain std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; }); @@ -1656,13 +1665,7 @@ namespace skch it->nucIdentity = ( std::accumulate( it, it_end, 0.0, [](double x, MappingResult &e){ return x + e.nucIdentity; }) - ) /// it->n_merged; // this would scale directly by the number of mappings in the chain - // this scales slightly by the amount of missing segments - / ( (double)it->n_merged - //+ std::pow( - //std::log((double)it->blockLength / param.segLength), - //0.01) - ); + ) / it->n_merged; // this would scale directly by the number of mappings in the chain //Discard other mappings of this chain @@ -1756,7 +1759,8 @@ namespace skch outstrm << sep << e.conservedSketches << sep << e.blockLength << sep << fakeMapQ - << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity; + << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity + << sep << "sc:f:" << e.seqComplexity; if (!param.mergeMappings) { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; From cc85484f476520b690c0e5d53d3df75cfa4c0b23 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 16:03:23 -0500 Subject: [PATCH 11/15] make length mismatch filter optional --- src/map/include/computeMap.hpp | 6 ++++-- src/map/include/map_parameters.hpp | 1 + src/map/include/parseCmdArgs.hpp | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index d721d80..4bb8beb 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -674,9 +674,11 @@ namespace skch // remove self-mode don't-maps this->filterSelfingLongToShorts(output->readMappings); - // Currently causing issues, disabling for now // remove alignments where the ratio between query and target length is < our identity threshold - //this->filterFalseHighIdentity(output->readMappings); + if (param.filterLengthMismatches) + { + this->filterFalseHighIdentity(output->readMappings); + } //Make sure mapping boundary don't exceed sequence lengths this->mappingBoundarySanityCheck(input, output->readMappings); diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index f8ffbe7..c2dada6 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -59,6 +59,7 @@ struct Parameters bool mergeMappings; //if we should merge consecutive segment mappings bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash + bool filterLengthMismatches; //true if filtering out length mismatches int sketchSize; diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 74e16c4..7343d34 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -103,6 +103,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("hgFilterAniDiff", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", ArgvParser::OptionRequiresValue); cmd.defineOption("hgFilterConf", "Confidence value for the hypergeometric filtering [default: 0.999]", ArgvParser::OptionRequiresValue); + cmd.defineOption("filterLengthMismatches", "Filter mappings where the ratio of reference/query mapped lengths disagrees with the ANI threshold"); + cmd.defineOption("skipSelf", "skip self mappings when the query and target name is the same (for all-vs-all mode)"); cmd.defineOptionAlternative("skipSelf", "X"); @@ -346,6 +348,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir //Do not expose the option to set protein alphabet in mashmap //parameters.alphabetSize = 20; + parameters.filterLengthMismatches = cmd.foundOption("filterLengthMismatches"); parameters.stage1_topANI_filter = !cmd.foundOption("noHgFilter"); if(cmd.foundOption("filter_mode")) From 0ac569a9e648b27a39c3d541fc33c83cb2b1e67e Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 16:33:14 -0500 Subject: [PATCH 12/15] Remove string view --- src/map/include/commonFunc.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp index a4401bb..4ee72a2 100644 --- a/src/map/include/commonFunc.hpp +++ b/src/map/include/commonFunc.hpp @@ -16,7 +16,6 @@ #include #include #include -#include //Own includes #include "map/include/map_parameters.hpp" @@ -189,7 +188,6 @@ namespace skch { sketched_heap.reserve(sketchSize+1); // Get distance until last "N" - std::string_view first_kmer(seq, kmerSize); int ambig_kmer_count = 0; for (int i = kmerSize - 1; i >= 0; i--) { From 1db87eaf56b4fa042ed021a280dea46ba6419d62 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Sun, 9 Jul 2023 16:35:22 -0500 Subject: [PATCH 13/15] Remove debug code --- src/map/include/computeMap.hpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 4bb8beb..82bb71c 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1177,19 +1177,6 @@ namespace skch l2_vec.clear(); computeL2MappedRegions(Q, candidateLocus, l2_vec); - //if (l2_vec.size() > 1) - //{ - //std::cerr << candidateLocus.seqId << ":" - //<< candidateLocus.rangeStartPos << "--" << candidateLocus.rangeEndPos << " == " << candidateLocus.rangeEndPos - candidateLocus.rangeStartPos - //<< std::endl; - //std::cerr << candidateLocus.intersectionSize - //<< "-->" << sketchCutoffs[candidateLocus.intersectionSize] << std::endl; - //for (const auto& l2 : l2_vec) - //{ - //std::cerr << l2.optimalStart << "--" << l2.optimalEnd << "\t" << l2.sharedSketchSize << std::endl; - //} - //std::cerr << std::endl; - //} for (auto& l2 : l2_vec) { From 0ce0c7d7f70ed248a6cdc94f9b78ba99ff82bba1 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 10 Jul 2023 12:41:14 -0500 Subject: [PATCH 14/15] Add kmer complexity filter --- src/map/include/base_types.hpp | 4 ++-- src/map/include/computeMap.hpp | 15 ++++++++------- src/map/include/map_parameters.hpp | 1 + src/map/include/parseCmdArgs.hpp | 11 +++++++++++ 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index 741ef1c..c6e44c5 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -166,7 +166,7 @@ namespace skch //--for split read mapping - long double seqComplexity; // Estimated sequence complexity + long double kmerComplexity; // Estimated sequence complexity int n_merged; // how many mappings we've merged into this one offset_t splitMappingId; // To identify split mappings that are chained uint8_t discard; // set to 1 for deletion @@ -276,7 +276,7 @@ namespace skch MinmerVec minmerTableQuery; //Vector of minmers in the query MinmerVec seedHits; //Vector of minmers in the reference int refGroup; //Prefix group of sequence - float seqComplexity; //Estimated sequence complexity + float kmerComplexity; //Estimated sequence complexity }; } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 82bb71c..e9df69f 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -804,7 +804,7 @@ namespace skch int orig_len = Q.minmerTableQuery.size(); #endif const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits::max(); - Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); + Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); // TODO remove them from the original sketch instead of removing for each read auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) { @@ -1191,8 +1191,9 @@ namespace skch // if we are in all-vs-all mode, it isn't a self-mapping, // and if we are self-mapping, the query is shorter than the target const auto& ref = this->refSketch.metadata[l2.seqId]; - if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) - || nucIdentity >= param.percentageIdentity) + if((Q.kmerComplexity >= param.kmerComplexityThreshold) + && ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) + || nucIdentity >= param.percentageIdentity)) { //Track the best jaccard numerator bestJaccardNumerator = std::max(bestJaccardNumerator, l2.sharedSketchSize); @@ -1215,7 +1216,7 @@ namespace skch res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos); res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0); res.strand = l2.strand; - res.seqComplexity = Q.seqComplexity; + res.kmerComplexity = Q.kmerComplexity; res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len); @@ -1528,8 +1529,8 @@ namespace skch [](double x, MappingResult &e){ return x + e.nucIdentity; }) )/ std::distance(it, it_end); //Mean sequence complexity of all mappings in the chain - it->seqComplexity = ( std::accumulate(it, it_end, 0.0, - [](double x, MappingResult &e){ return x + e.seqComplexity; }) )/ std::distance(it, it_end); + it->kmerComplexity = ( std::accumulate(it, it_end, 0.0, + [](double x, MappingResult &e){ return x + e.kmerComplexity; }) )/ std::distance(it, it_end); //Discard other mappings of this chain std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; }); @@ -1749,7 +1750,7 @@ namespace skch << sep << e.blockLength << sep << fakeMapQ << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity - << sep << "sc:f:" << e.seqComplexity; + << sep << "kc:f:" << e.kmerComplexity; if (!param.mergeMappings) { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index c2dada6..bdba569 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -60,6 +60,7 @@ struct Parameters bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash bool filterLengthMismatches; //true if filtering out length mismatches + float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) int sketchSize; diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 7343d34..23904bc 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOptionAlternative("kmer","k"); cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue); + cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue); //cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction."); @@ -510,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.percentageIdentity = 0.85; str.clear(); + if(cmd.foundOption("kmerComplexity")) + { + str << cmd.optionValue("kmerComplexity"); + str >> parameters.kmerComplexityThreshold; + str.clear(); + } + else + parameters.kmerComplexityThreshold = 0.0; + str.clear(); + if (cmd.foundOption("hgFilterAniDiff")) { str << cmd.optionValue("hgFilterAniDiff"); From b440e04a3cc21fcbe00f97cfd702a8737384c840 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Mon, 10 Jul 2023 12:47:26 -0500 Subject: [PATCH 15/15] Make progcontainer a child class --- src/map/include/base_types.hpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index c6e44c5..3bdc4ed 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -225,12 +225,9 @@ namespace skch , seqName(id) { } }; - struct InputSeqProgContainer + struct InputSeqProgContainer : InputSeqContainer { - seqno_t seqCounter; //sequence counter - offset_t len; //sequence length - std::string seq; //sequence string - std::string seqName; //sequence id + using InputSeqContainer::InputSeqContainer; progress_meter::ProgressMeter& progress; //progress meter (shared) @@ -241,10 +238,7 @@ namespace skch * @param[in] len length of sequence */ InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm) - : seqCounter(seqcount) - , len(s.length()) - , seq(s) - , seqName(id) + : InputSeqContainer(s, id, seqcount) , progress(pm) { } };