diff --git a/src/map/include/base_types.hpp b/src/map/include/base_types.hpp index 741ef1c..c6e44c5 100644 --- a/src/map/include/base_types.hpp +++ b/src/map/include/base_types.hpp @@ -166,7 +166,7 @@ namespace skch //--for split read mapping - long double seqComplexity; // Estimated sequence complexity + long double kmerComplexity; // Estimated sequence complexity int n_merged; // how many mappings we've merged into this one offset_t splitMappingId; // To identify split mappings that are chained uint8_t discard; // set to 1 for deletion @@ -276,7 +276,7 @@ namespace skch MinmerVec minmerTableQuery; //Vector of minmers in the query MinmerVec seedHits; //Vector of minmers in the reference int refGroup; //Prefix group of sequence - float seqComplexity; //Estimated sequence complexity + float kmerComplexity; //Estimated sequence complexity }; } diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 82bb71c..e9df69f 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -804,7 +804,7 @@ namespace skch int orig_len = Q.minmerTableQuery.size(); #endif const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits::max(); - Q.seqComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); + Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2); // TODO remove them from the original sketch instead of removing for each read auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) { @@ -1191,8 +1191,9 @@ namespace skch // if we are in all-vs-all mode, it isn't a self-mapping, // and if we are self-mapping, the query is shorter than the target const auto& ref = this->refSketch.metadata[l2.seqId]; - if((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) - || nucIdentity >= param.percentageIdentity) + if((Q.kmerComplexity >= param.kmerComplexityThreshold) + && ((param.keep_low_pct_id && nucIdentityUpperBound >= param.percentageIdentity) + || nucIdentity >= param.percentageIdentity)) { //Track the best jaccard numerator bestJaccardNumerator = std::max(bestJaccardNumerator, l2.sharedSketchSize); @@ -1215,7 +1216,7 @@ namespace skch res.blockLength = std::max(res.refEndPos - res.refStartPos, res.queryEndPos - res.queryStartPos); res.approxMatches = std::round(res.nucIdentity * res.blockLength / 100.0); res.strand = l2.strand; - res.seqComplexity = Q.seqComplexity; + res.kmerComplexity = Q.kmerComplexity; res.selfMapFilter = ((param.skip_self || param.skip_prefix) && Q.fullLen > ref.len); @@ -1528,8 +1529,8 @@ namespace skch [](double x, MappingResult &e){ return x + e.nucIdentity; }) )/ std::distance(it, it_end); //Mean sequence complexity of all mappings in the chain - it->seqComplexity = ( std::accumulate(it, it_end, 0.0, - [](double x, MappingResult &e){ return x + e.seqComplexity; }) )/ std::distance(it, it_end); + it->kmerComplexity = ( std::accumulate(it, it_end, 0.0, + [](double x, MappingResult &e){ return x + e.kmerComplexity; }) )/ std::distance(it, it_end); //Discard other mappings of this chain std::for_each( std::next(it), it_end, [&](MappingResult &e){ e.discard = 1; }); @@ -1749,7 +1750,7 @@ namespace skch << sep << e.blockLength << sep << fakeMapQ << sep << "id:f:" << (param.report_ANI_percentage ? 100.0 : 1.0) * e.nucIdentity - << sep << "sc:f:" << e.seqComplexity; + << sep << "kc:f:" << e.kmerComplexity; if (!param.mergeMappings) { outstrm << sep << "jc:f:" << float(e.conservedSketches) / e.sketchSize; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index c2dada6..bdba569 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -60,6 +60,7 @@ struct Parameters bool keep_low_pct_id; //true if we should keep mappings whose estimated identity < percentageIdentity bool report_ANI_percentage; //true if ANI should be in [0,100] as opposed to [0,1] (this is necessary for wfmash bool filterLengthMismatches; //true if filtering out length mismatches + float kmerComplexityThreshold; //minimum kmer complexity to consider (default 0) int sketchSize; diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 7343d34..23904bc 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -95,6 +95,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOptionAlternative("kmer","k"); cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue); + cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue); //cmd.defineOption("shortenCandidateRegions", "Only compute rolling minhash score for small regions around positions where the intersection of reference and query minmers is locally maximal. Results in slighty faster runtimes at the cost of mapping placement and ANI prediction."); @@ -510,6 +511,16 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir parameters.percentageIdentity = 0.85; str.clear(); + if(cmd.foundOption("kmerComplexity")) + { + str << cmd.optionValue("kmerComplexity"); + str >> parameters.kmerComplexityThreshold; + str.clear(); + } + else + parameters.kmerComplexityThreshold = 0.0; + str.clear(); + if (cmd.foundOption("hgFilterAniDiff")) { str << cmd.optionValue("hgFilterAniDiff");