From 385edae652dddcc4777e4b63e3f9f0f5852dd958 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 11 Oct 2024 12:06:02 -0500 Subject: [PATCH] Allow forcing global query sketch --- src/map/include/computeMap.hpp | 17 ++++++++++++----- src/map/include/map_parameters.hpp | 1 + src/map/include/parseCmdArgs.hpp | 12 ++++++------ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index 65af4c0..2385804 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -818,15 +818,22 @@ namespace skch void getSeedHits(Q_Info &Q) { Q.minmerTableQuery.reserve(param.sketchSize + 1); - if (Q.len > 1.5 * param.segLength) + + // Use global sketch of param.sketchSize if + // (a) We are splitting the query into segments + // (b) We are not splitting the query, but we want unbiased Jaccard + // (c) We are not splitting the query, but it isn't even longer than param.segLength + if (param.split + || param.forceGlobalQuerySketch + || Q.len <= param.segLength) { - CommonFunc::addMinmers(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.segLength, param.alphabetSize, param.sketchSize, Q.seqCounter); - std::sort(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [](const MinmerInfo& l, const MinmerInfo& r) {return l.hash < r.hash;}); - Q.minmerTableQuery.erase(std::unique(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end()), Q.minmerTableQuery.end()); + CommonFunc::sketchSequence(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.alphabetSize, param.sketchSize, Q.seqCounter); } else { - CommonFunc::sketchSequence(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.alphabetSize, param.sketchSize, Q.seqCounter); + CommonFunc::addMinmers(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.segLength, param.alphabetSize, param.sketchSize, Q.seqCounter); + std::sort(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [](const MinmerInfo& l, const MinmerInfo& r) {return l.hash < r.hash;}); + Q.minmerTableQuery.erase(std::unique(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end()), Q.minmerTableQuery.end()); } if(Q.minmerTableQuery.size() == 0) { Q.sketchSize = 0; diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index fd0dacc..dcf4c41 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -54,6 +54,7 @@ struct Parameters stdfs::path saveIndexFilename; //output file name of index stdfs::path loadIndexFilename; //input file name of index bool split; //Split read mapping (done if this is true) + bool forceGlobalQuerySketch; //Use the minhash sketch for entire query even if --noSplit is used bool lower_triangular; // set to true if we should filter out half of the mappings bool skip_self; //skip self mappings bool skip_prefix; //skip mappings to sequences with the same prefix diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index 10b5cca..8118ef4 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -60,7 +60,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("sketchSize", "Number of sketch elements", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("sketchSize","J"); - cmd.defineOption("dense", "Use dense sketching to yield higher ANI estimation accuracy. [disabled by default]"); + cmd.defineOption("dense", "Use dense sketching to yield higher ANI estimation accuracy."); cmd.defineOption("blockLength", "keep merged mappings supported by homologies of at least this length [default: segmentLength]", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("blockLength", "l"); @@ -77,7 +77,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue); - cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]"); + cmd.defineOption("noSplit", "disable splitting of input sequences during mapping"); + cmd.defineOption("unbiasedNoSplit", "same as noSplit, but uses same sketch size for each query, regardless of length. This leads to unbiased Jaccard prediction and faster mappings at a potential loss in mapping accuracy."); cmd.defineOption("perc_identity", "threshold for identity [default : 85]", ArgvParser::OptionRequiresValue); cmd.defineOptionAlternative("perc_identity","pi"); @@ -416,16 +417,15 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir else parameters.filterMode = filter::MAP; - if(cmd.foundOption("noSplit")) + parameters.forceGlobalQuerySketch = cmd.foundOption("unbiasedNoSplit"); + parameters.split = !cmd.foundOption("noSplit") && !cmd.foundOption("unbiasedNoSplit"); + if(!parameters.split) { - parameters.split = false; if (parameters.stage1_topANI_filter) std::cerr << "WARNING, skch::parseandSave, hypergeometric filter is incompatible with --noSplit. Disabling hypergeometric filter." << std::endl; parameters.stage1_topANI_filter = false; } - else - parameters.split = true; if(cmd.foundOption("noMerge")) {