Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add max_containment to MinHash class. #1346

Merged
merged 41 commits into from
Mar 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ba21268
add max_containment to MinHash
ctb Feb 23, 2021
74e8e07
add max_containment to SourmashSignature
ctb Feb 23, 2021
7e1bdca
add initial scaffolding for max_containment
ctb Feb 23, 2021
8f73291
compute actual max containment
ctb Feb 23, 2021
402ca77
interim comments
ctb Feb 23, 2021
21d6fdb
ok, the basic logic should be laid out
ctb Feb 23, 2021
787764a
fix typo per tessa
ctb Feb 23, 2021
7d2dae3
more typo
ctb Feb 23, 2021
bed7110
cleanup and fixes
ctb Feb 23, 2021
e3b0f61
change implementation away from **kwargs
ctb Feb 23, 2021
9ffa5ca
update lca_db.search for max_containment
ctb Feb 23, 2021
3a22806
Merge branch 'latest' of github.com:dib-lab/sourmash into add/max_con…
ctb Feb 23, 2021
ae92d08
implement sourmash search --max-containment
ctb Feb 23, 2021
61e9888
add lca database for --max-containment test
ctb Feb 24, 2021
30df58d
fix some issues with identifiers in the LCA code
ctb Feb 24, 2021
f014cca
Merge branch 'latest' of github.com:dib-lab/sourmash into add/max_con…
ctb Feb 26, 2021
ba79e9c
fix duplicate test name
ctb Mar 5, 2021
d8d3657
test basic search, no SBT
ctb Mar 5, 2021
fc499d5
fix previously hidden test
ctb Mar 5, 2021
77f5407
Merge branch 'latest' into add/max_containment
ctb Mar 5, 2021
4810b00
Merge branch 'latest' of github.com:dib-lab/sourmash into add/max_con…
ctb Mar 11, 2021
c8cb293
Merge branch 'add/max_containment' of github.com:dib-lab/sourmash int…
ctb Mar 11, 2021
f6296ab
fix typo
ctb Mar 11, 2021
90ffc98
d'oh
ctb Mar 11, 2021
3fd9942
fix max_containment; remove results caching, add assert
ctb Mar 11, 2021
8703e89
remove unused whitespace
ctb Mar 11, 2021
34c1146
check for error if both --containment and --max-containment
ctb Mar 11, 2021
0e230ef
containment can only be calculated on scaled sigs
ctb Mar 12, 2021
1fa8cb7
test SBT.search requires threshold
ctb Mar 12, 2021
5827df6
add --max-containment to compare, along with tests
ctb Mar 12, 2021
813064b
unmask overwritten test & fix
ctb Mar 12, 2021
a583783
remove duplicated test function
ctb Mar 12, 2021
e5c67d8
test compare --max-containment
ctb Mar 12, 2021
bbd3898
add test for both --containment and --max-containment
ctb Mar 12, 2021
8e63268
produce friendlier error message in search
ctb Mar 12, 2021
356e934
grammar fix in comments
ctb Mar 12, 2021
cbd2503
improve CSV output for search, marginally
ctb Mar 12, 2021
a64714c
Update src/sourmash/commands.py
ctb Mar 12, 2021
c717184
Update src/sourmash/minhash.py
ctb Mar 12, 2021
18b4f78
fix test, add tests for empty sigs
ctb Mar 12, 2021
108748a
--best-only on SBTs is currently incompatible with containment
ctb Mar 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sourmash/cli/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ def subparser(subparsers):
'--containment', action='store_true',
help='calculate containment instead of similarity'
)
subparser.add_argument(
'--max-containment', action='store_true',
help='calculate max containment instead of similarity'
)
subparser.add_argument(
'--from-file',
help='a text file containing a list of files to load signatures from'
Expand Down
8 changes: 6 additions & 2 deletions src/sourmash/cli/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,16 @@ def subparser(subparsers):
)
subparser.add_argument(
'--containment', action='store_true',
help='evaluate containment rather than similarity'
help='score based on containment rather than similarity'
)
subparser.add_argument(
'--max-containment', action='store_true',
help='score based on max containment rather than similarity'
)
subparser.add_argument(
'--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present; note: has no effect if '
'--containment is specified'
'--containment or --max-containment is specified'
)
subparser.add_argument(
'--scaled', metavar='FLOAT', type=float, default=0,
Expand Down
40 changes: 31 additions & 9 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import sys

import screed
from .compare import compare_all_pairs, compare_serial_containment
from .compare import (compare_all_pairs, compare_serial_containment,
compare_serial_max_containment)
from . import MinHash
from .sbtmh import load_sbt_index, create_sbt_index
from . import signature as sig
Expand Down Expand Up @@ -91,16 +92,24 @@ def compare(args):
error('cannot mix scaled signatures with bounded signatures')
sys.exit(-1)

is_containment = False
if args.containment or args.max_containment:
is_containment = True

if args.containment and args.max_containment:
notify("ERROR: cannot specify both --containment and --max-containment!")
sys.exit(-1)

# complain if --containment and not is_scaled
if args.containment and not is_scaled:
error('must use scaled signatures with --containment option')
if is_containment and not is_scaled:
error('must use scaled signatures with --containment and --max-containment')
sys.exit(-1)

# notify about implicit --ignore-abundance:
if args.containment:
if is_containment:
track_abundances = any(( s.minhash.track_abundance for s in siglist ))
if track_abundances:
notify('NOTE: --containment means signature abundances are flattened.')
notify('NOTE: --containment and --max-containment ignore signature abundances.')

# if using --scaled, downsample appropriately
printed_scaled_msg = False
Expand All @@ -127,6 +136,8 @@ def compare(args):
labeltext = [str(item) for item in siglist]
if args.containment:
similarity = compare_serial_containment(siglist)
elif args.max_containment:
similarity = compare_serial_max_containment(siglist)
else:
similarity = compare_all_pairs(siglist, args.ignore_abundance,
n_jobs=args.processes)
Expand Down Expand Up @@ -437,8 +448,14 @@ def search(args):
query.minhash = query.minhash.downsample(scaled=args.scaled)

# set up the search databases
is_containment = args.containment or args.max_containment
if is_containment:
if args.containment and args.max_containment:
notify("ERROR: cannot specify both --containment and --max-containment!")
sys.exit(-1)

databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not args.containment)
not is_containment)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
Expand All @@ -450,8 +467,11 @@ def search(args):

# do the actual search
results = search_databases(query, databases,
args.threshold, args.containment,
args.best_only, args.ignore_abundance,
threshold=args.threshold,
do_containment=args.containment,
do_max_containment=args.max_containment,
best_only=args.best_only,
ignore_abundance=args.ignore_abundance,
unload_data=True)

n_matches = len(results)
Expand All @@ -477,7 +497,8 @@ def search(args):
notify("** reporting only one match because --best-only was set")

if args.output:
fieldnames = ['similarity', 'name', 'filename', 'md5']
fieldnames = ['similarity', 'name', 'filename', 'md5',
'query_filename', 'query_name', 'query_md5']
Comment on lines +500 to +501
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to rename the `similarity` column to `containment` / `max_containment` in the CSV output when those options are used. Thoughts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Semantic versioning prevents us from removing the `similarity` header before 5.0. We could add new columns, I suppose. I don't like the idea that column headers change depending on command-line arguments, though. Not sure how to think about it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(suggest we punt this to a new issue and discuss it there.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

punted to #1390


with FileOutput(args.output, 'wt') as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
Expand All @@ -486,6 +507,7 @@ def search(args):
for sr in results:
d = dict(sr._asdict())
del d['match']
del d['query']
w.writerow(d)

# save matching signatures upon request
Expand Down
24 changes: 23 additions & 1 deletion src/sourmash/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def compare_serial(siglist, ignore_abundance, downsample=False):
def compare_serial_containment(siglist, downsample=False):
"""Compare all combinations of signatures and return a matrix
of containments. Processes combinations serially on a single
process. Best to use when there is few signatures.
process. Best to only use when there are few signatures.

:param list siglist: list of signatures to compare
:param boolean downsample by scaled if True
Expand All @@ -61,6 +61,28 @@ def compare_serial_containment(siglist, downsample=False):
return containments


def compare_serial_max_containment(siglist, downsample=False):
    """Build the pairwise max-containment matrix for a list of signatures.

    Every (row, column) pair is evaluated one after another in the calling
    process, so this is best reserved for small numbers of signatures.

    :param list siglist: signatures to compare against each other
    :param boolean downsample: forwarded unchanged to each signature's
        ``max_containment`` call
    :return: np.array of shape (len(siglist), len(siglist)); entry
        ``[i][j]`` holds ``siglist[j].max_containment(siglist[i])``
    """
    import numpy as np

    size = len(siglist)
    matrix = np.ones((size, size))

    # Fill every cell, including the diagonal and both triangles.
    for row, row_sig in enumerate(siglist):
        for col, col_sig in enumerate(siglist):
            matrix[row, col] = col_sig.max_containment(
                row_sig, downsample=downsample)

    return matrix


def similarity_args_unpack(args, ignore_abundance, downsample):
"""Helper function to unpack the arguments. Written to use in pool.imap
as it can only be given one argument."""
Expand Down
20 changes: 12 additions & 8 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def find(self, search_fn, *args, **kwargs):
matches.append(node)
return matches

def search(self, query, *args, **kwargs):
def search(self, query, threshold=None,
do_containment=False, do_max_containment=False,
ignore_abundance=False, **kwargs):
"""Return set of matches with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.
Expand All @@ -55,16 +57,18 @@ def search(self, query, *args, **kwargs):
"""

# check arguments
if 'threshold' not in kwargs:
if threshold is None:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']
threshold = float(threshold)

do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', False)
if do_containment and do_max_containment:
raise TypeError("'do_containment' and 'do_max_containment' cannot both be True")

# configure search - containment? ignore abundance?
if do_containment:
query_match = lambda x: query.contained_by(x, downsample=True)
elif do_max_containment:
query_match = lambda x: query.max_containment(x, downsample=True)
else:
query_match = lambda x: query.similarity(
x, downsample=True, ignore_abundance=ignore_abundance)
Expand All @@ -73,9 +77,9 @@ def search(self, query, *args, **kwargs):
matches = []

for ss in self.signatures():
similarity = query_match(ss)
if similarity >= threshold:
matches.append((similarity, ss, self.filename))
score = query_match(ss)
if score >= threshold:
matches.append((score, ss, self.filename))

# sort!
matches.sort(key=lambda x: -x[0])
Expand Down
21 changes: 13 additions & 8 deletions src/sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ def save(self, db_name):

json.dump(save_d, fp)

def search(self, query, *args, **kwargs):
def search(self, query, threshold=None, do_containment=False,
do_max_containment=False, ignore_abundance=False, **kwargs):
"""Return set of matches with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.
Expand All @@ -319,18 +320,18 @@ def search(self, query, *args, **kwargs):
return []

# check arguments
if 'threshold' not in kwargs:
if threshold is None:
raise TypeError("'search' requires 'threshold'")
threshold = kwargs['threshold']
do_containment = kwargs.get('do_containment', False)
ignore_abundance = kwargs.get('ignore_abundance', False)
threshold = float(threshold)

mh = query.minhash
if ignore_abundance:
mh.track_abundance = False

# find all the matches, then sort & return.
results = []
for x in self._find_signatures(mh, threshold, do_containment):
for x in self._find_signatures(mh, threshold, do_containment,
do_max_containment):
(score, match, filename) = x
results.append((score, match, filename))

Expand Down Expand Up @@ -455,7 +456,8 @@ def _signatures(self):
return sigd

def _find_signatures(self, minhash, threshold, containment=False,
ignore_scaled=False):
max_containment=False,
ignore_scaled=False):
"""
Do a Jaccard similarity or containment search, yield results.

Expand All @@ -467,7 +469,7 @@ def _find_signatures(self, minhash, threshold, containment=False,
if self.scaled > minhash.scaled:
minhash = minhash.downsample(scaled=self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
# note that containment can be calculated w/o matching scaled.
# note that containment cannot be calculated w/o matching scaled.
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))

query_mins = set(minhash.hashes)
Expand All @@ -494,6 +496,9 @@ def _find_signatures(self, minhash, threshold, containment=False,
# calculate the containment or similarity
if containment:
score = count / len(query_mins)
elif max_containment:
denom = min((len(query_mins), match_size))
score = count / denom
else:
# query_mins is size of query signature
# match_size is size of match signature
Expand Down
12 changes: 12 additions & 0 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,18 @@ def contained_by(self, other, downsample=False):

return self.count_common(other, downsample) / len(self)

def max_containment(self, other, downsample=False):
    """
    Calculate maximum containment between this MinHash and `other`.

    The result is ``self.count_common(other, downsample)`` divided by the
    smaller of ``len(self)`` and ``len(other)``.

    Raises TypeError if either object has a false-y ``scaled`` value.
    Returns 0.0 when either object has length zero.
    """
    if not self.scaled or not other.scaled:
        raise TypeError("can only calculate containment for scaled MinHashes")

    smaller_size = min(len(self), len(other))
    if not smaller_size:
        # Nothing to contain; also avoids dividing by zero.
        return 0.0

    shared = self.count_common(other, downsample)
    return shared / smaller_size

def __add__(self, other):
if not isinstance(other, MinHash):
raise TypeError("can only add MinHash objects to MinHash objects!")
Expand Down
31 changes: 23 additions & 8 deletions src/sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,10 @@ def find(self, search_fn, *args, **kwargs):

return matches

def search(self, query, *args, **kwargs):
def search(self, query, threshold=None,
ignore_abundance=False, do_containment=False,
do_max_containment=False, best_only=False,
unload_data=False, **kwargs):
"""Return set of matches with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.
Expand All @@ -350,15 +353,17 @@ def search(self, query, *args, **kwargs):
* ignore_abundance: default False. If True, and query signature
and database support k-mer abundances, ignore those abundances.
"""
from .sbtmh import search_minhashes, search_minhashes_containment
from .sbtmh import (search_minhashes, search_minhashes_containment,
search_minhashes_max_containment)
from .sbtmh import SearchMinHashesFindBest
from .signature import SourmashSignature

threshold = kwargs['threshold']
ignore_abundance = kwargs.get('ignore_abundance', False)
do_containment = kwargs.get('do_containment', False)
best_only = kwargs.get('best_only', False)
unload_data = kwargs.get('unload_data', False)
if threshold is None:
raise TypeError("'search' requires 'threshold'")
threshold = float(threshold)

if do_containment and do_max_containment:
raise TypeError("'do_containment' and 'do_max_containment' cannot both be True")

# figure out scaled value of tree, downsample query if needed.
leaf = next(iter(self.leaves()))
Expand All @@ -378,13 +383,23 @@ def search(self, query, *args, **kwargs):
if do_containment:
search_fn = search_minhashes_containment
query_match = lambda x: tree_query.contained_by(x, downsample=True)
elif do_max_containment:
search_fn = search_minhashes_max_containment
query_match = lambda x: tree_query.max_containment(x,
downsample=True)

if best_only: # this needs to be reset for each SBT
if do_containment or do_max_containment:
raise TypeError("'best_only' is incompatible with 'do_containment' and 'do_max_containment'")
search_fn = SearchMinHashesFindBest().search
ctb marked this conversation as resolved.
Show resolved Hide resolved

# now, search!
results = []
for leaf in self.find(search_fn, tree_query, threshold, unload_data=unload_data):

# here, self.find is used only to find candidate nodes;
for leaf in self.find(search_fn, tree_query, threshold,
unload_data=unload_data):
# the actual calculation of node match is done here:
similarity = query_match(leaf.data)

# tree search should always/only return matches above threshold
Expand Down
Loading