From 671fc42945581002ae431e9e7cabbcc4f1087068 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Thu, 29 Jun 2023 15:13:12 -0500
Subject: [PATCH 01/15] Add option to report ANI as percentage in [0, 100]

---
 src/map/include/computeMap.hpp     | 2 +-
 src/map/include/map_parameters.hpp | 2 ++
 src/map/include/parseCmdArgs.hpp   | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 38445db..75a71f5 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1589,7 +1589,7 @@ namespace skch
             outstrm  << sep << e.conservedSketches
                      << sep << e.blockLength
                      << sep << fakeMapQ
-                     << sep << "id:f:" << e.nucIdentity;
+                     << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity;
             if (!param.mergeMappings) 
             {
               outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize;
diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
index 22c4512..cda710f 100644
--- a/src/map/include/map_parameters.hpp
+++ b/src/map/include/map_parameters.hpp
@@ -58,6 +58,8 @@ struct Parameters
     char prefix_delim;                                //the prefix delimiter
     bool mergeMappings;                               //if we should merge consecutive segment mappings
     bool keep_low_pct_id;                             //true if we should keep mappings whose estimated identity < percentageIdentity
+    bool report_ANI_percentage;                       //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash
+
 
     int sketchSize;
     bool use_spaced_seeds;                            //
diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp
index 1ad5d4d..74e16c4 100644
--- a/src/map/include/parseCmdArgs.hpp
+++ b/src/map/include/parseCmdArgs.hpp
@@ -123,6 +123,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     cmd.defineOptionAlternative("filter_mode", "f");
 
     cmd.defineOption("legacy", "MashMap2 output format");
+    cmd.defineOption("reportPercentage", "Report predicted ANI values in [0, 100] instead of [0,1] (necessary for use with wfmash)");
   }
 
   /**
@@ -596,6 +597,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
       parameters.outFileName = "mashmap.out";
 
     parameters.legacy_output = cmd.foundOption("legacy");
+    parameters.report_ANI_percentage = cmd.foundOption("reportPercentage");
 
     printCmdOptions(parameters);
 

From d586a037df366229d3a3838053b4984fc9dd237c Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Tue, 4 Jul 2023 17:38:02 -0500
Subject: [PATCH 02/15] add missing include

---
 src/common/progress.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common/progress.hpp b/src/common/progress.hpp
index e63e531..fd9d5b9 100644
--- a/src/common/progress.hpp
+++ b/src/common/progress.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cmath>
 #include <iostream>
 #include <string>
 #include <atomic>

From 38f3e386cf10351a3d9d06fb450293b381e80f6c Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Tue, 4 Jul 2023 17:38:34 -0500
Subject: [PATCH 03/15] Add address sanitizer for debug

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 561a375..ab37515 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,8 @@ endif ()
 if (${CMAKE_BUILD_TYPE} MATCHES Debug)
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -g")
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O -g")
+    add_compile_options(-fsanitize=address)
+    add_link_options(-fsanitize=address)
 else()
     # Use all standard-compliant optimizations - always add these:
     set (CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${PIC_FLAG} ${EXTRA_FLAGS}")

From b9bf235e936cf376339fb04e17a62d288095bb93 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Tue, 4 Jul 2023 17:38:44 -0500
Subject: [PATCH 04/15] Working split-prefix filter

---
 src/map/include/base_types.hpp |  41 ++++-
 src/map/include/computeMap.hpp | 313 +++++++++++++++++++++++++--------
 2 files changed, 278 insertions(+), 76 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index ff5d5f0..d346f14 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -10,6 +10,7 @@
 #include <tuple>
 #include <vector>
 #include <chrono>
+#include "common/progress.hpp"
 
 namespace skch
 {
@@ -204,10 +205,11 @@ namespace skch
   //Container to save copy of kseq object
   struct InputSeqContainer
   {
-    seqno_t seqCounter;                 //sequence counter
-    offset_t len;                       //sequence length
-    std::string seq;                    //sequence string
-    std::string seqName;                //sequence id
+    seqno_t seqCounter;                         //sequence counter
+    offset_t len;                               //sequence length
+    std::string seq;                            //sequence string
+    std::string seqName;                        //sequence id
+
 
     /*
      * @brief               constructor
@@ -216,12 +218,36 @@ namespace skch
      * @param[in] len       length of sequence
      */
       InputSeqContainer(const std::string& s, const std::string& id, seqno_t seqcount)
-          : seq(s)
-          , seqName(id)
+          : seqCounter(seqcount)
           , len(s.length())
-          , seqCounter(seqcount) { }
+          , seq(s)
+          , seqName(id) { }
   };
 
+  struct InputSeqProgContainer
+  {
+    seqno_t seqCounter;                         //sequence counter
+    offset_t len;                               //sequence length
+    std::string seq;                            //sequence string
+    std::string seqName;                        //sequence id
+    progress_meter::ProgressMeter& progress;    //progress meter (shared)
+                                                
+
+    /*
+     * @brief               constructor
+     * @param[in] kseq_seq  complete read or reference sequence
+     * @param[in] kseq_id   sequence id name
+     * @param[in] len       length of sequence
+     */
+      InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm)
+          : seqCounter(seqcount)
+          , len(s.length())
+          , seq(s)
+          , seqName(id)
+          , progress(pm) { }
+  };
+
+
   //Output type of map function
   struct MapModuleOutput
   {
@@ -248,6 +274,7 @@ namespace skch
       std::string seqName;                //sequence name
       MinmerVec minmerTableQuery;         //Vector of minmers in the query
       MinmerVec seedHits;                 //Vector of minmers in the reference
+      int refGroup;                       //Prefix group of sequence
     };
 }
 
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 75a71f5..ad8ecb0 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -106,6 +106,10 @@ namespace skch
       //for an L1 candidate if the best intersection size is i;
       std::vector<int> sketchCutoffs; 
 
+      //Vector for obtaining group from refId
+      //if refIdGroup[i] == refIdGroup[j], then sequence i and j have the same prefix;
+      std::vector<int> refIdGroup; 
+
     public:
 
       /**
@@ -117,18 +121,58 @@ namespace skch
       Map(const skch::Parameters &p, const skch::Sketch &refsketch,
           PostProcessResultsFn_t f = nullptr) :
         param(p),
-        sketchCutoffs(p.sketchSize + 1, 1),
         refSketch(refsketch),
-        processMappingResults(f)
+        processMappingResults(f),
+        sketchCutoffs(p.sketchSize + 1, 1),
+        refIdGroup(refsketch.metadata.size())
     {
       if (p.stage1_topANI_filter) {
         this->setProbs();
       }
+      if (p.skip_prefix)
+      {
+        this->setRefGroups();
+      }
       this->mapQuery();
     }
 
     private:
 
+      // Sets the groups of reference contigs based on prefix
+      void setRefGroups()
+      {
+        int group = 0;
+        int start_idx = 0;
+        int idx = 0;
+        while (start_idx < this->refSketch.metadata.size())
+        {
+          const auto currPrefix = prefix(this->refSketch.metadata[start_idx].name, param.prefix_delim);
+          idx = start_idx;
+          while (idx < this->refSketch.metadata.size()
+              && currPrefix == prefix(this->refSketch.metadata[idx].name, param.prefix_delim))
+          {
+            this->refIdGroup[idx++] = group;
+          }
+          group++;
+          start_idx = idx;
+        }
+      }
+
+      // Gets the ref group of a query based on the prefix
+      int getRefGroup(const std::string& seqName)
+      {
+        const auto queryPrefix = prefix(seqName, param.prefix_delim);
+        for (int i = 0; i < this->refSketch.metadata.size(); i++)
+        {
+          const auto currPrefix = prefix(this->refSketch.metadata[i].name, param.prefix_delim);
+          if (queryPrefix == currPrefix)
+          {
+            return this->refIdGroup[i];
+          }
+        }
+        // Doesn't belong to any ref group
+        return -1;
+      }
       void setProbs() 
       {
 
@@ -226,7 +270,7 @@ namespace skch
         MappingResultsVector_t allReadMappings;  //Aggregate mapping results for the complete run
 
         //Create the thread pool
-        ThreadPool<InputSeqContainer, MapModuleOutput> threadPool( [this](InputSeqContainer* e){return mapModule(e);}, param.threads);
+        ThreadPool<InputSeqProgContainer, MapModuleOutput> threadPool( [this](InputSeqProgContainer* e){return mapModule(e);}, param.threads);
 
         // kind've expensive if the fasta index is not available for the query sequences,
         // but it can help people know how long we're going to take
@@ -291,14 +335,14 @@ namespace skch
                     {
                         totalReadsPickedForMapping++;
                         //Dispatch input to thread
-                        threadPool.runWhenThreadAvailable(new InputSeqContainer(seq, seq_name, seqCounter));
+                        threadPool.runWhenThreadAvailable(new InputSeqProgContainer(seq, seq_name, seqCounter, progress));
 
                         //Collect output if available
                         while ( threadPool.outputAvailable() ) {
                             mapModuleHandleOutput(threadPool.popOutputWhenAvailable(), allReadMappings, totalReadsMapped, outstrm, progress);
                         }
                     }
-                    progress.increment(seq.size()/2);
+                    //progress.increment(seq.size()/2);
                     seqCounter++;
                 }); //Finish reading query input file
 
@@ -431,13 +475,69 @@ namespace skch
           }
       }
 
+      /**
+       * @brief               helper to main filtering function
+       * @details             filters mappings by group
+       * @param[in]   input   unfiltered mappings
+       * @param[in]   output  filtered mappings
+       * @param[in]   input   num mappings per segment
+       * @return              void
+       */
+      void filterByGroup(
+          MappingResultsVector_t &unfilteredMappings,
+          MappingResultsVector_t &filteredMappings,
+          int n_mappings)
+      {
+        filteredMappings.reserve(unfilteredMappings.size());
+
+        std::sort(unfilteredMappings.begin(), unfilteredMappings.end(), [](const auto& a, const auto& b) 
+            { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); });
+        auto subrange_begin = unfilteredMappings.begin();
+        auto subrange_end = unfilteredMappings.begin();
+        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) 
+        {
+          while (subrange_end != unfilteredMappings.end())
+          {
+            if (param.skip_prefix)
+            {
+              int currGroup = this->refIdGroup[subrange_begin->refSeqId];
+              subrange_end = std::find_if_not(subrange_begin, unfilteredMappings.end(), [this, currGroup] (const auto& unfilteredMappings_candidate) {
+                  return currGroup == this->refIdGroup[unfilteredMappings_candidate.refSeqId];
+              });
+            }
+            else
+            {
+              subrange_end = unfilteredMappings.end();
+            }
+            // TODO why are we filtering these before merging?
+            std::vector<skch::MappingResult> tmpMappings(std::distance(subrange_begin, subrange_end));
+            std::move(subrange_begin, subrange_end, tmpMappings.begin());
+            std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b) 
+                { return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); });
+            skch::Filter::query::filterMappings(tmpMappings, n_mappings);
+            std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings));
+            subrange_begin = subrange_end;
+          }
+        }
+        //Sort the mappings by query (then reference) position
+        std::sort(
+            filteredMappings.begin(), filteredMappings.end(),
+            [](const MappingResult &a, const MappingResult &b) {
+                return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) 
+                  < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos);
+                //return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos)
+                    //< std::tie(b.refSeqId, b.refStartPos, b.queryStartPos);
+            });
+      }
+
+
       /**
        * @brief               main mapping function given an input read
        * @details             this function is run in parallel by multiple threads
        * @param[in]   input   input read details
        * @return              output object containing the mappings
        */
-      MapModuleOutput* mapModule (InputSeqContainer* input)
+      MapModuleOutput* mapModule (InputSeqProgContainer* input)
       {
         MapModuleOutput* output = new MapModuleOutput();
 
@@ -447,11 +547,12 @@ namespace skch
         bool split_mapping = true;
         std::vector<IntervalPoint> intervalPoints;
         // Reserve the "expected" number of interval points
-        intervalPoints.reserve(2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size());
-
+        intervalPoints.reserve(
+            2 * param.sketchSize * refSketch.minmerIndex.size() / refSketch.minmerPosLookupIndex.size());
         std::vector<L1_candidateLocus_t> l1Mappings;
-
         MappingResultsVector_t l2Mappings;
+        MappingResultsVector_t unfilteredMappings;
+        int refGroup = this->getRefGroup(input->seqName);
 
         if(! param.split || input->len <= param.segLength)
         {
@@ -461,15 +562,18 @@ namespace skch
           Q.fullLen = input->len;
           Q.seqCounter = input->seqCounter;
           Q.seqName = input->seqName;
+          Q.refGroup = refGroup;
 
           //Map this sequence
           mapSingleQueryFrag(Q, intervalPoints, l1Mappings, l2Mappings);
 
           // save the output
-          output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end());
+          unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
 
           // indicate that we mapped full length
           split_mapping = false;
+
+          input->progress.increment(input->len);
         }
         else  //Split read mapping
         {
@@ -485,6 +589,7 @@ namespace skch
             Q.fullLen = input->len;
             Q.seqCounter = input->seqCounter;
             Q.seqName = input->seqName;
+            Q.refGroup = refGroup;
 
             intervalPoints.clear();
             l1Mappings.clear();
@@ -501,7 +606,8 @@ namespace skch
                 });
 
             // save the output
-            output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end());
+            unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
+            input->progress.increment(param.segLength);
           }
 
           //Map last overlapping fragment to cover the whole read
@@ -513,6 +619,7 @@ namespace skch
             Q.len = param.segLength;
             Q.seqCounter = input->seqCounter;
             Q.seqName = input->seqName;
+            Q.refGroup = refGroup;
 
             intervalPoints.clear();
             l1Mappings.clear();
@@ -528,7 +635,9 @@ namespace skch
                 e.queryEndPos = input->len;
                 });
 
-            output->readMappings.insert(output->readMappings.end(), l2Mappings.begin(), l2Mappings.end());
+            unfilteredMappings.insert(unfilteredMappings.end(), l2Mappings.begin(), l2Mappings.end());
+
+            input->progress.increment(input->len % param.segLength);
           }
         }
 
@@ -537,25 +646,44 @@ namespace skch
                           param.numMappingsForShortSequence
                           : param.numMappingsForSegment) - 1;
 
-        if (split_mapping) {
-          if (param.mergeMappings) {
+        // TODO this should happen after chaining i think
+        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {                      
+          if (!param.stage1_topANI_filter) {
+            filterByGroup(unfilteredMappings, output->readMappings, n_mappings*100);
+            std::swap(unfilteredMappings, output->readMappings);
+            output->readMappings.clear();
+          }
+        }
+
+        if (split_mapping) 
+        {
+          if (param.mergeMappings) 
+          {
             // hardcore merge using the chain gap
-            //mergeMappingsInRange(output->readMappings, param.chain_gap);
-            mergeMappings(output->readMappings);
-            if (input->len >= param.block_length) {
+            mergeMappingsInRange(unfilteredMappings, param.chain_gap);
+            //mergeMappings(unfilteredMappings);
+            if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {                      
+              MappingResultsVector_t tempMappings;
+              tempMappings.reserve(output->readMappings.size());
+              filterByGroup(unfilteredMappings, tempMappings, n_mappings);
+              std::swap(tempMappings, unfilteredMappings);
+            }
+            if (input->len >= param.block_length) 
+            {
               // remove short chains that didn't exceed block length
-              filterWeakMappings(output->readMappings, std::floor(param.block_length / param.segLength));
+              filterWeakMappings(unfilteredMappings, std::floor(param.block_length / param.segLength));
             }
           }
         }
 
-        // TODO why are we filtering these before merging?
-        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
-            skch::Filter::query::filterMappings(output->readMappings, n_mappings);
-        }
+        std::swap(output->readMappings, unfilteredMappings);
 
         // remove self-mode don't-maps
         this->filterSelfingLongToShorts(output->readMappings);
+        
+        // Currently causing issues, disabling for now
+        // remove alignments where the ratio between query and target length is < our identity threshold
+        //this->filterFalseHighIdentity(output->readMappings);
 
         //Make sure mapping boundary don't exceed sequence lengths
         this->mappingBoundarySanityCheck(input, output->readMappings);
@@ -594,7 +722,7 @@ namespace skch
             reportReadMappings(output->readMappings, output->qseqName, outstrm);
           }
 
-          progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0));
+          //progress.increment(output->qseqLen/2 + (output->qseqLen % 2 != 0));
 
           delete output;
         }
@@ -622,9 +750,36 @@ namespace skch
           auto t1 = skch::Time::now();
 #endif
 
-          //L2 Mapping
-          doL2Mapping(Q, l1Mappings, l2Mappings);
+          auto l1_begin = l1Mappings.begin();
+          auto l1_end = l1Mappings.begin();
+          while (l1_end != l1Mappings.end())
+          {
+            if (param.skip_prefix)
+            {
+              int currGroup = this->refIdGroup[l1_begin->seqId];
+              l1_end = std::find_if_not(l1_begin, l1Mappings.end(), [this, currGroup] (const auto& candidate) {
+                  return currGroup == this->refIdGroup[candidate.seqId];
+              });
+            }
+            else
+            {
+              l1_end = l1Mappings.end();
+            }
 
+            //Sort L1 windows based on intersection size if using hg filter
+            if (param.stage1_topANI_filter)
+            {
+              std::make_heap(l1_begin, l1_end, L1_locus_intersection_cmp);
+            }
+            doL2Mapping(Q, l1_begin, l1_end, l2Mappings);
+
+            // Set beginning of next range
+            l1_begin = l1_end;
+          }
+
+          // Sort output mappings
+          std::sort(l2Mappings.begin(), l2Mappings.end(), [](const auto& a, const auto& b) 
+              { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); });
 
 #ifdef ENABLE_TIME_PROFILE_L1_L2
           {
@@ -710,8 +865,12 @@ namespace skch
           while(!pq.empty())
           {
             const IP_const_iterator ip_it = pq.front().it;
-            if (!param.skip_self || Q.seqName != this->refSketch.metadata[ip_it->seqId].name) 
-            {
+            const auto& ref = this->refSketch.metadata[ip_it->seqId];
+            if ((!param.skip_prefix && !param.skip_self)
+                || ((Q.fullLen <= ref.len)
+                  && ((param.skip_self && Q.seqName != ref.name)
+                      || (param.skip_prefix && this->refIdGroup[ip_it->seqId] != Q.refGroup)))
+            ) {
               intervalPoints.push_back(*ip_it);
             }
             std::pop_heap(pq.begin(), pq.end(), heap_cmp);
@@ -732,8 +891,13 @@ namespace skch
         }
 
 
-      template <typename Q_Info, typename Vec1, typename Vec2>
-        void computeL1CandidateRegions(Q_Info &Q, Vec1& intervalPoints, int minimumHits, Vec2 &l1Mappings)
+      template <typename Q_Info, typename IP_iter, typename Vec2>
+        void computeL1CandidateRegions(
+            Q_Info &Q, 
+            IP_iter ip_begin, 
+            IP_iter ip_end, 
+            int minimumHits, 
+            Vec2 &l1Mappings)
         {
 #ifdef DEBUG
           std::cerr << "INFO, skch::Map:computeL1CandidateRegions, read id " << Q.seqCounter << std::endl;
@@ -746,8 +910,8 @@ namespace skch
 
           // Keep track of all minmer windows that intersect with [i, i+windowLen]
           int windowLen = std::max<offset_t>(0, Q.len - param.segLength);
-          auto trailingIt = intervalPoints.begin();
-          auto leadingIt = intervalPoints.begin();
+          auto trailingIt = ip_begin;
+          auto leadingIt = ip_begin;
 
           // Group together local sketch intersection maximums that are within clusterLen of eachother
           //
@@ -761,11 +925,11 @@ namespace skch
           std::unordered_map<hash_t, int> hash_to_freq;
 
           if (param.stage1_topANI_filter) {
-            while (leadingIt != intervalPoints.end())
+            while (leadingIt != ip_end)
             {
               // Catch the trailing iterator up to the leading iterator - windowLen
               while (
-                  trailingIt != intervalPoints.end() 
+                  trailingIt != ip_end 
                   && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen)
                     || trailingIt->seqId < leadingIt->seqId))
               {
@@ -779,7 +943,7 @@ namespace skch
                 trailingIt++;
               }
               auto currentPos = leadingIt->pos;
-              while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos) {
+              while (leadingIt != ip_end && leadingIt->pos == currentPos) {
                 if (leadingIt->side == side::OPEN) {
                   if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) {
                     overlapCount++;
@@ -820,8 +984,8 @@ namespace skch
 
           bool in_candidate = false;
           L1_candidateLocus_t l1_out = {};
-          trailingIt = intervalPoints.begin();
-          leadingIt = intervalPoints.begin();
+          trailingIt = ip_begin;
+          leadingIt = ip_begin;
 
           // Keep track of 3 consecutive points so that we can track local optimums
           overlapCount = 0;
@@ -832,7 +996,7 @@ namespace skch
           SeqCoord prevPos, currentPos;
 
 
-          while (leadingIt != intervalPoints.end())
+          while (leadingIt != ip_end)
           {
             prevPrevOverlap = prevOverlap;
             prevOverlap = overlapCount;
@@ -842,7 +1006,7 @@ namespace skch
             // right now, this basically happens since every CLOSE should be an OPEN.
             // This doesn't invalidate the logic, just potentially wastes time
             while (
-                trailingIt != intervalPoints.end() 
+                trailingIt != ip_end 
                 && ((trailingIt->seqId == leadingIt->seqId && trailingIt->pos <= leadingIt->pos - windowLen)
                   || trailingIt->seqId < leadingIt->seqId))
             {
@@ -859,7 +1023,7 @@ namespace skch
               prevPos = currentPos;
               currentPos = SeqCoord{leadingIt->seqId, leadingIt->pos};
             }
-            while (leadingIt != intervalPoints.end() && leadingIt->pos == currentPos.pos) 
+            while (leadingIt != ip_end && leadingIt->pos == currentPos.pos) 
             {
               if (leadingIt->side == side::OPEN) {
                 if (windowLen == 0 || hash_to_freq[leadingIt->hash] == 0) {
@@ -953,12 +1117,26 @@ namespace skch
 
           //3. Compute L1 windows
           int minimumHits = Stat::estimateMinimumHitsRelaxed(Q.sketchSize, param.kmerSize, param.percentageIdentity, skch::fixed::confidence_interval);
-          computeL1CandidateRegions(Q, intervalPoints, minimumHits, l1Mappings);
 
-          //4. Sort L1 windows based on intersection size if using hg filter
-          if (param.stage1_topANI_filter)
+          // For each "group"
+          auto ip_begin = intervalPoints.begin();
+          auto ip_end = intervalPoints.begin();
+          while (ip_end != intervalPoints.end())
           {
-            std::make_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp);
+            if (param.skip_prefix)
+            {
+              int currGroup = this->refIdGroup[ip_begin->seqId];
+              ip_end = std::find_if_not(ip_begin, intervalPoints.end(), [this, currGroup] (const auto& ip) {
+                  return currGroup == this->refIdGroup[ip.seqId];
+              });
+            }
+            else
+            {
+              ip_end = intervalPoints.end();
+            }
+            computeL1CandidateRegions(Q, ip_begin, ip_end, minimumHits, l1Mappings);
+
+            ip_begin = ip_end;
           }
         }
 
@@ -975,20 +1153,14 @@ namespace skch
        * @param[in]   l1Mappings                candidate regions for query sequence found at L1
        * @param[out]  l2Mappings                Mapping results in the L2 stage
        */
-      template <typename Q_Info, typename VecIn, typename VecOut>
-        void doL2Mapping(Q_Info &Q, VecIn &l1Mappings, VecOut &l2Mappings)
+      template <typename Q_Info, typename L1_Iter, typename VecOut>
+        void doL2Mapping(Q_Info &Q, L1_Iter l1_begin, L1_Iter l1_end, VecOut &l2Mappings)
         {
           ///2. Walk the read over the candidate regions and compute the jaccard similarity with minimum s sketches
           std::vector<L2_mapLocus_t> l2_vec;
-          if (!param.stage1_topANI_filter)
-          {
-            // Only reserve if we know that we'll be storing each of the l1 mappings
-            l2Mappings.reserve(l1Mappings.size());
-          }
-
           double bestJaccardNumerator = 0;
-          auto loc_iterator = l1Mappings.begin();
-          while (loc_iterator != l1Mappings.end())
+          auto loc_iterator = l1_begin;
+          while (loc_iterator != l1_end)
           {
             L1_candidateLocus_t& candidateLocus = *loc_iterator;
 
@@ -996,9 +1168,9 @@ namespace skch
             {
               // If using HG filter, don't consider any mappings which have no chance of being 
               // within param.ANIDiff of the best mapping seen so far
-              double cutoff_ani = (1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff;
+              double cutoff_ani = std::max(0.0, double((1 - Stat::j2md(bestJaccardNumerator / Q.sketchSize, param.kmerSize)) - param.ANIDiff));
               double cutoff_j = Stat::md2j(1 - cutoff_ani, param.kmerSize);
-              if (float(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) 
+              if (double(candidateLocus.intersectionSize) / Q.sketchSize < cutoff_j) 
               {
                 break;
               }
@@ -1007,6 +1179,19 @@ namespace skch
 
             l2_vec.clear();
             computeL2MappedRegions(Q, candidateLocus, l2_vec);
+            //if (l2_vec.size() > 1)
+            //{
+              //std::cerr <<  candidateLocus.seqId << ":"
+               //<< candidateLocus.rangeStartPos << "--" << candidateLocus.rangeEndPos  << " == " << candidateLocus.rangeEndPos - candidateLocus.rangeStartPos
+               //<< std::endl;
+              //std::cerr << candidateLocus.intersectionSize
+               //<< "-->" << sketchCutoffs[candidateLocus.intersectionSize] << std::endl;
+              //for (const auto& l2 : l2_vec)
+              //{
+                //std::cerr << l2.optimalStart << "--" << l2.optimalEnd << "\t" << l2.sharedSketchSize << std::endl;  
+              //}
+              //std::cerr << std::endl;
+            //}
 
             for (auto& l2 : l2_vec) 
             {
@@ -1021,12 +1206,8 @@ namespace skch
               // if we are in all-vs-all mode, it isn't a self-mapping,
               // and if we are self-mapping, the query is shorter than the target
               const auto& ref = this->refSketch.metadata[l2.seqId];
-              if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity
+              if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
                   || nucIdentity >= param.percentageIdentity)
-                 && !(param.skip_self && Q.seqName == ref.name)
-                 && !(param.skip_prefix
-                      && prefix(Q.seqName, param.prefix_delim)
-                      == prefix(ref.name, param.prefix_delim)))
               {
                 //Track the best jaccard numerator
                 bestJaccardNumerator = std::max<double>(bestJaccardNumerator, l2.sharedSketchSize);
@@ -1059,23 +1240,17 @@ namespace skch
 
             if (param.stage1_topANI_filter) 
             {
-              std::pop_heap(l1Mappings.begin(), l1Mappings.end(), L1_locus_intersection_cmp); 
-              l1Mappings.pop_back();
+              std::pop_heap(l1_begin, l1_end, L1_locus_intersection_cmp); 
+              l1_end--; //"Pop back" 
             }
             else 
             {
               loc_iterator++;
             }
           }
-          std::sort(l2Mappings.begin(), l2Mappings.end(), [](auto& a, auto& b) 
-              { return std::tie(a.refSeqId, a.refStartPos) < std::tie(b.refSeqId, b.refStartPos); });
           //std::cerr << "For an segment with " << l1Mappings.size()
             //<< " L1 mappings "
             //<< " there were " << l2Mappings.size() << " L2 mappings\n";
-
-#ifdef DEBUG
-          std::cerr << "[mashmap::skch::Map:doL2Mapping] read id " << Q.seqCounter << ", count of L1 candidates= " << l1Mappings.size() << ", count of L2 candidates= " << l2Mappings.size() << std::endl;
-#endif
         }
 
       /**
@@ -1519,7 +1694,7 @@ namespace skch
        * @param[in/out] readMappings  Mappings computed by Mashmap (L2 stage) for a read
        */
       template <typename VecIn>
-        void mappingBoundarySanityCheck(InputSeqContainer* input, VecIn &readMappings)
+        void mappingBoundarySanityCheck(InputSeqProgContainer* input, VecIn &readMappings)
         {
           for(auto &e : readMappings)
           {

From efce4ecbb1267bf62096bebc4421f7a7a75b9749 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 14:36:42 -0500
Subject: [PATCH 05/15] Fix missing include found in #54

---
 src/common/utils.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common/utils.hpp b/src/common/utils.hpp
index f5646dc..119be9d 100644
--- a/src/common/utils.hpp
+++ b/src/common/utils.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <cmath>
 #include <string>
 

From 3045335bc8bd67dd28629fa4363c93971c724e19 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 14:39:45 -0500
Subject: [PATCH 06/15] Update version number

---
 src/map/include/map_parameters.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
index cda710f..f8ffbe7 100644
--- a/src/map/include/map_parameters.hpp
+++ b/src/map/include/map_parameters.hpp
@@ -91,7 +91,7 @@ float confidence_interval = 0.95;                   // Confidence interval to re
 float percentage_identity = 0.85;                   // Percent identity in the mapping step
 float ANIDiff = 0.0;                                // Stage 1 ANI diff threshold
 float ANIDiffConf = 0.999;                          // ANI diff confidence
-std::string VERSION = "3.0.5";                      // Version of MashMap
+std::string VERSION = "3.0.6";                      // Version of MashMap
 }
 }
 

From f7c4856ed132b187c8a67d2fe85d210e8400ff25 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 15:25:13 -0500
Subject: [PATCH 07/15] Ignore k-mers w/ ambiguous nucleotides

---
 src/map/include/commonFunc.hpp | 39 ++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp
index 76946ff..a4401bb 100644
--- a/src/map/include/commonFunc.hpp
+++ b/src/map/include/commonFunc.hpp
@@ -16,6 +16,7 @@
 #include <numeric>
 #include <queue>
 #include <sstream>
+#include <string_view>
 
 //Own includes
 #include "map/include/map_parameters.hpp"
@@ -186,9 +187,26 @@ namespace skch {
           ankerl::unordered_dense::map<hash_t, MinmerInfo> sketched_vals;
           std::vector<hash_t> sketched_heap;
           sketched_heap.reserve(sketchSize+1);
+            
+          // Get distance until last "N"
+          std::string_view first_kmer(seq, kmerSize);
+          int ambig_kmer_count = 0;
+          for (int i = kmerSize - 1; i >= 0; i--)
+          {
+            if (seq[i] == 'N')
+            {
+                ambig_kmer_count = i+1;
+                break;
+            }    
+          } 
 
           for(offset_t i = 0; i < len - kmerSize + 1; i++)
           {
+
+            if (seq[i+kmerSize-1] == 'N')
+            {
+              ambig_kmer_count = kmerSize;
+            }
             //Hash kmers
             hash_t hashFwd = CommonFunc::getHash(seq + i, kmerSize); 
             hash_t hashBwd;
@@ -199,7 +217,7 @@ namespace skch {
               hashBwd = std::numeric_limits<hash_t>::max();   //Pick a dummy high value so that it is ignored later
 
             //Consider non-symmetric kmers only
-            if(hashBwd != hashFwd)
+            if(hashBwd != hashFwd && ambig_kmer_count == 0)
             {
               //Take minimum value of kmer and its reverse complement
               hash_t currentKmer = std::min(hashFwd, hashBwd);
@@ -237,6 +255,10 @@ namespace skch {
                 }
               }
             }
+            if (ambig_kmer_count > 0)
+            {
+              ambig_kmer_count--;
+            }
           }
 
           minmerIndex.resize(sketched_heap.size());
@@ -293,6 +315,10 @@ namespace skch {
 
             //if(alphabetSize == 4) //not protein
               //CommonFunc::reverseComplement(seq, seqRev.get(), len);
+            
+            // Get distance until last "N"
+            int ambig_kmer_count = 0;
+
 
             for(offset_t i = 0; i < len - kmerSize + 1; i++)
             {
@@ -369,8 +395,12 @@ namespace skch {
                 Q.pop_front();
               }
 
+              if (seq[i+kmerSize-1] == 'N')
+              {
+                ambig_kmer_count = kmerSize;
+              }
               //Consider non-symmetric kmers only
-              if(hashBwd != hashFwd)
+              if(hashBwd != hashFwd && ambig_kmer_count == 0)
               {
                 // Add current hash to window
                 Q.push_back(std::make_tuple(currentKmer, currentStrand, i)); 
@@ -399,6 +429,11 @@ namespace skch {
                   std::push_heap(heapWindow.begin(), heapWindow.end(), KIHeap_cmp);
                 }
               }
+              if (ambig_kmer_count > 0)
+              {
+                ambig_kmer_count--;
+              }
+              
 
 
 

From f17cf64ab7e6b409aedd57a641ce8ae2d653cc57 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 15:26:10 -0500
Subject: [PATCH 08/15] Do not split sequences smaller than the block length

---
 src/map/include/computeMap.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index ad8ecb0..8396096 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -554,7 +554,7 @@ namespace skch
         MappingResultsVector_t unfilteredMappings;
         int refGroup = this->getRefGroup(input->seqName);
 
-        if(! param.split || input->len <= param.segLength)
+        if(! param.split || input->len <= param.segLength || input->len <= param.block_length)
         {
           QueryMetaData <MinVec_Type> Q;
           Q.seq = &(input->seq)[0u];

From 7bc4d401ececcc994ae26f46e7103f52c1b5fbf5 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 15:26:44 -0500
Subject: [PATCH 09/15] Only filter once and after chaining

---
 src/map/include/computeMap.hpp | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 8396096..d872ce9 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -646,15 +646,6 @@ namespace skch
                           param.numMappingsForShortSequence
                           : param.numMappingsForSegment) - 1;
 
-        // TODO this should happen after chaining i think
-        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {                      
-          if (!param.stage1_topANI_filter) {
-            filterByGroup(unfilteredMappings, output->readMappings, n_mappings*100);
-            std::swap(unfilteredMappings, output->readMappings);
-            output->readMappings.clear();
-          }
-        }
-
         if (split_mapping) 
         {
           if (param.mergeMappings) 
@@ -662,12 +653,6 @@ namespace skch
             // hardcore merge using the chain gap
             mergeMappingsInRange(unfilteredMappings, param.chain_gap);
             //mergeMappings(unfilteredMappings);
-            if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {                      
-              MappingResultsVector_t tempMappings;
-              tempMappings.reserve(output->readMappings.size());
-              filterByGroup(unfilteredMappings, tempMappings, n_mappings);
-              std::swap(tempMappings, unfilteredMappings);
-            }
             if (input->len >= param.block_length) 
             {
               // remove short chains that didn't exceed block length
@@ -676,6 +661,13 @@ namespace skch
           }
         }
 
+        if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {                      
+          MappingResultsVector_t tempMappings;
+          tempMappings.reserve(output->readMappings.size());
+          filterByGroup(unfilteredMappings, tempMappings, n_mappings);
+          std::swap(tempMappings, unfilteredMappings);
+        }
+
         std::swap(output->readMappings, unfilteredMappings);
 
         // remove self-mode don't-maps

From e9b82f6f8281290e61468e54385592ccd21ad128 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 15:28:20 -0500
Subject: [PATCH 10/15] Add predicted sequence complexity tag (sc:f:x.xx)

---
 src/map/include/base_types.hpp |  2 ++
 src/map/include/computeMap.hpp | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index d346f14..741ef1c 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -166,6 +166,7 @@ namespace skch
 
                                                         //--for split read mapping
 
+    long double seqComplexity;                               // Estimated sequence complexity
     int n_merged;                                       // how many mappings we've merged into this one
     offset_t splitMappingId;                            // To identify split mappings that are chained
     uint8_t discard;                                    // set to 1 for deletion
@@ -275,6 +276,7 @@ namespace skch
       MinmerVec minmerTableQuery;         //Vector of minmers in the query
       MinmerVec seedHits;                 //Vector of minmers in the reference
       int refGroup;                       //Prefix group of sequence
+      float seqComplexity;                //Estimated sequence complexity
     };
 }
 
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index d872ce9..d721d80 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -7,6 +7,7 @@
 #ifndef SKETCH_MAP_HPP
 #define SKETCH_MAP_HPP
 
+#include <limits>
 #include <vector>
 #include <algorithm>
 #include <unordered_map>
@@ -800,6 +801,9 @@ namespace skch
 #ifdef DEBUG
           int orig_len = Q.minmerTableQuery.size();
 #endif
+          const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
+          Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
+
           // TODO remove them from the original sketch instead of removing for each read
           auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
             return refSketch.isFreqSeed(mi.hash);
@@ -1222,6 +1226,7 @@ namespace skch
                   res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos);
                   res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0);
                   res.strand = l2.strand; 
+                  res.seqComplexity = Q.seqComplexity;
 
                   res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len);
 
@@ -1533,6 +1538,10 @@ namespace skch
             it->nucIdentity = (   std::accumulate(it, it_end, 0.0,
                                   [](double x, MappingResult &e){ return x + e.nucIdentity; })     )/ std::distance(it, it_end);
 
+            //Mean sequence complexity of all mappings in the chain
+            it->seqComplexity = (   std::accumulate(it, it_end, 0.0,
+                                  [](double x, MappingResult &e){ return x + e.seqComplexity; })     )/ std::distance(it, it_end);
+
             //Discard other mappings of this chain
             std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; });
 
@@ -1656,13 +1665,7 @@ namespace skch
               it->nucIdentity = ( std::accumulate(
                                       it, it_end, 0.0,
                                       [](double x, MappingResult &e){ return x + e.nucIdentity; })
-                  ) /// it->n_merged; // this would scale directly by the number of mappings in the chain
-                  // this scales slightly by the amount of missing segments
-                  / ( (double)it->n_merged
-                      //+ std::pow(
-                          //std::log((double)it->blockLength / param.segLength),
-                          //0.01)
-                      );
+                  ) / it->n_merged; // this would scale directly by the number of mappings in the chain
 
 
               //Discard other mappings of this chain
@@ -1756,7 +1759,8 @@ namespace skch
             outstrm  << sep << e.conservedSketches
                      << sep << e.blockLength
                      << sep << fakeMapQ
-                     << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity;
+                     << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity
+                     << sep << "sc:f:" << e.seqComplexity;
             if (!param.mergeMappings) 
             {
               outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize;

From cc85484f476520b690c0e5d53d3df75cfa4c0b23 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 16:03:23 -0500
Subject: [PATCH 11/15] make length mismatch filter optional

---
 src/map/include/computeMap.hpp     | 6 ++++--
 src/map/include/map_parameters.hpp | 1 +
 src/map/include/parseCmdArgs.hpp   | 3 +++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index d721d80..4bb8beb 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -674,9 +674,11 @@ namespace skch
         // remove self-mode don't-maps
         this->filterSelfingLongToShorts(output->readMappings);
         
-        // Currently causing issues, disabling for now
         // remove alignments where the ratio between query and target length is < our identity threshold
-        //this->filterFalseHighIdentity(output->readMappings);
+        if (param.filterLengthMismatches)
+        {
+          this->filterFalseHighIdentity(output->readMappings);
+        }
 
         //Make sure mapping boundary don't exceed sequence lengths
         this->mappingBoundarySanityCheck(input, output->readMappings);
diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
index f8ffbe7..c2dada6 100644
--- a/src/map/include/map_parameters.hpp
+++ b/src/map/include/map_parameters.hpp
@@ -59,6 +59,7 @@ struct Parameters
     bool mergeMappings;                               //if we should merge consecutive segment mappings
     bool keep_low_pct_id;                             //true if we should keep mappings whose estimated identity < percentageIdentity
     bool report_ANI_percentage;                       //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash
+    bool filterLengthMismatches;                      //true if filtering out length mismatches
 
 
     int sketchSize;
diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp
index 74e16c4..7343d34 100644
--- a/src/map/include/parseCmdArgs.hpp
+++ b/src/map/include/parseCmdArgs.hpp
@@ -103,6 +103,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     cmd.defineOption("hgFilterAniDiff", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", ArgvParser::OptionRequiresValue);
     cmd.defineOption("hgFilterConf", "Confidence value for the hypergeometric filtering [default: 0.999]", ArgvParser::OptionRequiresValue);
 
+    cmd.defineOption("filterLengthMismatches", "Filter mappings where the ratio of reference/query mapped lengths disagrees with the ANI threshold");
+
     cmd.defineOption("skipSelf", "skip self mappings when the query and target name is the same (for all-vs-all mode)");
     cmd.defineOptionAlternative("skipSelf", "X");
 
@@ -346,6 +348,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     //Do not expose the option to set protein alphabet in mashmap
     //parameters.alphabetSize = 20;
     
+    parameters.filterLengthMismatches = cmd.foundOption("filterLengthMismatches"); 
     parameters.stage1_topANI_filter = !cmd.foundOption("noHgFilter"); 
 
     if(cmd.foundOption("filter_mode"))

From 0ac569a9e648b27a39c3d541fc33c83cb2b1e67e Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 16:33:14 -0500
Subject: [PATCH 12/15] Remove string view

---
 src/map/include/commonFunc.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/map/include/commonFunc.hpp b/src/map/include/commonFunc.hpp
index a4401bb..4ee72a2 100644
--- a/src/map/include/commonFunc.hpp
+++ b/src/map/include/commonFunc.hpp
@@ -16,7 +16,6 @@
 #include <numeric>
 #include <queue>
 #include <sstream>
-#include <string_view>
 
 //Own includes
 #include "map/include/map_parameters.hpp"
@@ -189,7 +188,6 @@ namespace skch {
           sketched_heap.reserve(sketchSize+1);
             
           // Get distance until last "N"
-          std::string_view first_kmer(seq, kmerSize);
           int ambig_kmer_count = 0;
           for (int i = kmerSize - 1; i >= 0; i--)
           {

From 1db87eaf56b4fa042ed021a280dea46ba6419d62 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Sun, 9 Jul 2023 16:35:22 -0500
Subject: [PATCH 13/15] Remove debug code

---
 src/map/include/computeMap.hpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 4bb8beb..82bb71c 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -1177,19 +1177,6 @@ namespace skch
 
             l2_vec.clear();
             computeL2MappedRegions(Q, candidateLocus, l2_vec);
-            //if (l2_vec.size() > 1)
-            //{
-              //std::cerr <<  candidateLocus.seqId << ":"
-               //<< candidateLocus.rangeStartPos << "--" << candidateLocus.rangeEndPos  << " == " << candidateLocus.rangeEndPos - candidateLocus.rangeStartPos
-               //<< std::endl;
-              //std::cerr << candidateLocus.intersectionSize
-               //<< "-->" << sketchCutoffs[candidateLocus.intersectionSize] << std::endl;
-              //for (const auto& l2 : l2_vec)
-              //{
-                //std::cerr << l2.optimalStart << "--" << l2.optimalEnd << "\t" << l2.sharedSketchSize << std::endl;  
-              //}
-              //std::cerr << std::endl;
-            //}
 
             for (auto& l2 : l2_vec) 
             {

From 0ce0c7d7f70ed248a6cdc94f9b78ba99ff82bba1 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Mon, 10 Jul 2023 12:41:14 -0500
Subject: [PATCH 14/15] Add kmer complexity filter

---
 src/map/include/base_types.hpp     |  4 ++--
 src/map/include/computeMap.hpp     | 15 ++++++++-------
 src/map/include/map_parameters.hpp |  1 +
 src/map/include/parseCmdArgs.hpp   | 11 +++++++++++
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index 741ef1c..c6e44c5 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -166,7 +166,7 @@ namespace skch
 
                                                         //--for split read mapping
 
-    long double seqComplexity;                               // Estimated sequence complexity
+    long double kmerComplexity;                               // Estimated sequence complexity
     int n_merged;                                       // how many mappings we've merged into this one
     offset_t splitMappingId;                            // To identify split mappings that are chained
     uint8_t discard;                                    // set to 1 for deletion
@@ -276,7 +276,7 @@ namespace skch
       MinmerVec minmerTableQuery;         //Vector of minmers in the query
       MinmerVec seedHits;                 //Vector of minmers in the reference
       int refGroup;                       //Prefix group of sequence
-      float seqComplexity;                //Estimated sequence complexity
+      float kmerComplexity;                //Estimated sequence complexity
     };
 }
 
diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp
index 82bb71c..e9df69f 100644
--- a/src/map/include/computeMap.hpp
+++ b/src/map/include/computeMap.hpp
@@ -804,7 +804,7 @@ namespace skch
           int orig_len = Q.minmerTableQuery.size();
 #endif
           const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
-          Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
+          Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);
 
           // TODO remove them from the original sketch instead of removing for each read
           auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
@@ -1191,8 +1191,9 @@ namespace skch
               // if we are in all-vs-all mode, it isn't a self-mapping,
               // and if we are self-mapping, the query is shorter than the target
               const auto& ref = this->refSketch.metadata[l2.seqId];
-              if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
-                  || nucIdentity >= param.percentageIdentity)
+              if((Q.kmerComplexity >= param.kmerComplexityThreshold)
+                  && ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity)
+                  || nucIdentity >= param.percentageIdentity))
               {
                 //Track the best jaccard numerator
                 bestJaccardNumerator = std::max<double>(bestJaccardNumerator, l2.sharedSketchSize);
@@ -1215,7 +1216,7 @@ namespace skch
                   res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos);
                   res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0);
                   res.strand = l2.strand; 
-                  res.seqComplexity = Q.seqComplexity;
+                  res.kmerComplexity = Q.kmerComplexity;
 
                   res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len);
 
@@ -1528,8 +1529,8 @@ namespace skch
                                   [](double x, MappingResult &e){ return x + e.nucIdentity; })     )/ std::distance(it, it_end);
 
             //Mean sequence complexity of all mappings in the chain
-            it->seqComplexity = (   std::accumulate(it, it_end, 0.0,
-                                  [](double x, MappingResult &e){ return x + e.seqComplexity; })     )/ std::distance(it, it_end);
+            it->kmerComplexity = (   std::accumulate(it, it_end, 0.0,
+                                  [](double x, MappingResult &e){ return x + e.kmerComplexity; })     )/ std::distance(it, it_end);
 
             //Discard other mappings of this chain
             std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; });
@@ -1749,7 +1750,7 @@ namespace skch
                      << sep << e.blockLength
                      << sep << fakeMapQ
                      << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity
-                     << sep << "sc:f:" << e.seqComplexity;
+                     << sep << "kc:f:" << e.kmerComplexity;
             if (!param.mergeMappings) 
             {
               outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize;
diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp
index c2dada6..bdba569 100644
--- a/src/map/include/map_parameters.hpp
+++ b/src/map/include/map_parameters.hpp
@@ -60,6 +60,7 @@ struct Parameters
     bool keep_low_pct_id;                             //true if we should keep mappings whose estimated identity < percentageIdentity
     bool report_ANI_percentage;                       //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash
     bool filterLengthMismatches;                      //true if filtering out length mismatches
+    float kmerComplexityThreshold;                    //minimum kmer complexity to consider (default 0)
 
 
     int sketchSize;
diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp
index 7343d34..23904bc 100644
--- a/src/map/include/parseCmdArgs.hpp
+++ b/src/map/include/parseCmdArgs.hpp
@@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
     cmd.defineOptionAlternative("kmer","k");
 
     cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
+    cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);
 
 
     //cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction.");
@@ -510,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
       parameters.percentageIdentity = 0.85;
     str.clear();
 
+    if(cmd.foundOption("kmerComplexity"))
+    {
+      str << cmd.optionValue("kmerComplexity");
+      str >> parameters.kmerComplexityThreshold;
+      str.clear();
+    }
+    else
+      parameters.kmerComplexityThreshold = 0.0;
+    str.clear();
+
 
     if (cmd.foundOption("hgFilterAniDiff")) {
       str << cmd.optionValue("hgFilterAniDiff");

From b440e04a3cc21fcbe00f97cfd702a8737384c840 Mon Sep 17 00:00:00 2001
From: Bryce Lorenz Kille <brycekille@gmail.com>
Date: Mon, 10 Jul 2023 12:47:26 -0500
Subject: [PATCH 15/15] Make progcontainer a child class

---
 src/map/include/base_types.hpp | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp
index c6e44c5..3bdc4ed 100644
--- a/src/map/include/base_types.hpp
+++ b/src/map/include/base_types.hpp
@@ -225,12 +225,9 @@ namespace skch
           , seqName(id) { }
   };
 
-  struct InputSeqProgContainer
+  struct InputSeqProgContainer : InputSeqContainer
   {
-    seqno_t seqCounter;                         //sequence counter
-    offset_t len;                               //sequence length
-    std::string seq;                            //sequence string
-    std::string seqName;                        //sequence id
+    using InputSeqContainer::InputSeqContainer;
     progress_meter::ProgressMeter& progress;    //progress meter (shared)
                                                 
 
@@ -241,10 +238,7 @@ namespace skch
      * @param[in] len       length of sequence
      */
       InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm)
-          : seqCounter(seqcount)
-          , len(s.length())
-          , seq(s)
-          , seqName(id)
+          : InputSeqContainer(s, id, seqcount)
           , progress(pm) { }
   };