Skip to content

Commit

Permalink
Allow forcing global query sketch
Browse files Browse the repository at this point in the history
  • Loading branch information
bkille committed Oct 11, 2024
1 parent 51215f5 commit 385edae
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 11 deletions.
17 changes: 12 additions & 5 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -818,15 +818,22 @@ namespace skch
void getSeedHits(Q_Info &Q)
{
Q.minmerTableQuery.reserve(param.sketchSize + 1);
if (Q.len > 1.5 * param.segLength)

// Use global sketch of param.sketchSize if
// (a) We are splitting the query into segments
// (b) We are not splitting the query, but we want unbiased Jaccard
// (c) We are not splitting the query, but it isn't even longer than param.segLength
if (param.split
|| param.forceGlobalQuerySketch
|| Q.len <= param.segLength)
{
CommonFunc::addMinmers(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.segLength, param.alphabetSize, param.sketchSize, Q.seqCounter);
std::sort(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [](const MinmerInfo& l, const MinmerInfo& r) {return l.hash < r.hash;});
Q.minmerTableQuery.erase(std::unique(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end()), Q.minmerTableQuery.end());
CommonFunc::sketchSequence(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.alphabetSize, param.sketchSize, Q.seqCounter);
}
else
{
CommonFunc::sketchSequence(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.alphabetSize, param.sketchSize, Q.seqCounter);
CommonFunc::addMinmers(Q.minmerTableQuery, Q.seq, Q.len, param.kmerSize, param.segLength, param.alphabetSize, param.sketchSize, Q.seqCounter);
std::sort(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [](const MinmerInfo& l, const MinmerInfo& r) {return l.hash < r.hash;});
Q.minmerTableQuery.erase(std::unique(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end()), Q.minmerTableQuery.end());
}
if(Q.minmerTableQuery.size() == 0) {
Q.sketchSize = 0;
Expand Down
1 change: 1 addition & 0 deletions src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct Parameters
stdfs::path saveIndexFilename; //output file name of index
stdfs::path loadIndexFilename; //input file name of index
bool split; //Split read mapping (done if this is true)
bool forceGlobalQuerySketch; //Use the minhash sketch for entire query even if --noSplit is used
bool lower_triangular; // set to true if we should filter out half of the mappings
bool skip_self; //skip self mappings
bool skip_prefix; //skip mappings to sequences with the same prefix
Expand Down
12 changes: 6 additions & 6 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOption("sketchSize", "Number of sketch elements", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("sketchSize","J");

cmd.defineOption("dense", "Use dense sketching to yield higher ANI estimation accuracy. [disabled by default]");
cmd.defineOption("dense", "Use dense sketching to yield higher ANI estimation accuracy.");

cmd.defineOption("blockLength", "keep merged mappings supported by homologies of at least this length [default: segmentLength]", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("blockLength", "l");
Expand All @@ -77,7 +77,8 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOption("loadIndex", "Prefix of index files to load, where PREFIX.map and PREFIX.index are the files to be loaded", ArgvParser::OptionRequiresValue);


cmd.defineOption("noSplit", "disable splitting of input sequences during mapping [enabled by default]");
cmd.defineOption("noSplit", "disable splitting of input sequences during mapping");
cmd.defineOption("unbiasedNoSplit", "same as noSplit, but uses same sketch size for each query, regardless of length. This leads to unbiased Jaccard prediction and faster mappings at a potential loss in mapping accuracy.");

cmd.defineOption("perc_identity", "threshold for identity [default : 85]", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("perc_identity","pi");
Expand Down Expand Up @@ -416,16 +417,15 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
else
parameters.filterMode = filter::MAP;

if(cmd.foundOption("noSplit"))
parameters.forceGlobalQuerySketch = cmd.foundOption("unbiasedNoSplit");
parameters.split = !cmd.foundOption("noSplit") && !cmd.foundOption("unbiasedNoSplit");
if(!parameters.split)
{
parameters.split = false;
if (parameters.stage1_topANI_filter)
std::cerr << "WARNING, skch::parseandSave, hypergeometric filter is incompatible with --noSplit. Disabling hypergeometric filter." << std::endl;

parameters.stage1_topANI_filter = false;
}
else
parameters.split = true;

if(cmd.foundOption("noMerge"))
{
Expand Down

0 comments on commit 385edae

Please sign in to comment.