Skip to content

Commit

Permalink
MB-61640: Fuzzy Dynamic Scoring (#2056)
Browse files Browse the repository at this point in the history
- Added levenshtein distance calculation for fuzzy and wildcard
searchers
- Added new implementations of certain functions to allow passing of
edit distances per term
 - Multiplied boosts by inverse of edit distance for score calculation

---------

Co-authored-by: Abhinav Dangeti <[email protected]>
  • Loading branch information
Likith101 and abhinavdangeti authored Nov 21, 2024
1 parent 3a21667 commit 38f1737
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 14 deletions.
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.1.12
github.com/blevesearch/bleve_index_api v1.1.13
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-faiss v1.0.23
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
Expand All @@ -18,13 +18,13 @@ require (
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/stempel v0.2.0
github.com/blevesearch/upsidedown_store_api v1.0.2
github.com/blevesearch/vellum v1.0.10
github.com/blevesearch/vellum v1.0.11
github.com/blevesearch/zapx/v11 v11.3.10
github.com/blevesearch/zapx/v12 v12.3.10
github.com/blevesearch/zapx/v13 v13.3.10
github.com/blevesearch/zapx/v14 v14.3.10
github.com/blevesearch/zapx/v15 v15.3.16
github.com/blevesearch/zapx/v16 v16.1.8
github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2
github.com/couchbase/moss v0.2.0
github.com/golang/protobuf v1.3.2
github.com/spf13/cobra v1.7.0
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.23 h1:Wmc5AFwDLKGl2L6mjLX1Da3vCL0EKa2uHHSorcIS1Uc=
Expand Down Expand Up @@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI=
github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k=
github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
Expand All @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE=
github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
github.com/blevesearch/zapx/v16 v16.1.8 h1:Bxzpw6YQpFs7UjoCV1+RvDw6fmAT2GZxldwX8b3wVBM=
github.com/blevesearch/zapx/v16 v16.1.8/go.mod h1:JqQlOqlRVaYDkpLIl3JnKql8u4zKTNlVEa3nLsi0Gn8=
github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2 h1:+RX9SM7KO7q91E7rFj4NARSsAhKj2EbvdWfzX+ihg/w=
github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2/go.mod h1:zuxVgVaLZ0g4lZvrv06xDc24N6nLCOzXYHVkXI7LMHM=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
59 changes: 59 additions & 0 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2953,3 +2953,62 @@ func TestCopyIndex(t *testing.T) {
}
}
}

func TestFuzzyScoring(t *testing.T) {
tmpIndexPath := createTmpIndexPath(t)
defer cleanupTmpIndexPath(t, tmpIndexPath)

mp := NewIndexMapping()
mp.DefaultAnalyzer = "simple"
idx, err := New(tmpIndexPath, mp)
if err != nil {
t.Fatal(err)
}

batch := idx.NewBatch()

docs := []map[string]interface{}{
{
"textField": "ab",
},
{
"textField": "abc",
},
{
"textField": "abcd",
},
}

for _, doc := range docs {
err := batch.Index(fmt.Sprintf("%v", doc["textField"]), doc)
if err != nil {
t.Fatal(err)
}
}

err = idx.Batch(batch)
if err != nil {
t.Fatal(err)
}

query := NewFuzzyQuery("ab")
query.Fuzziness = 2
searchRequest := NewSearchRequestOptions(query, 10, 0, true)
res, err := idx.Search(searchRequest)
if err != nil {
t.Error(err)
}

maxScore := res.Hits[0].Score

for i, hit := range res.Hits {
if maxScore/float64(i+1) != hit.Score {
t.Errorf("expected score - %f, got score - %f", maxScore/float64(i+1), hit.Score)
}
}

err = idx.Close()
if err != nil {
t.Fatal(err)
}
}
16 changes: 11 additions & 5 deletions search/searcher/search_fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}

var candidates []string
var editDistances []uint8
var dictBytesRead uint64
if fuzzyCandidates != nil {
candidates = fuzzyCandidates.candidates
editDistances = fuzzyCandidates.editDistances
dictBytesRead = fuzzyCandidates.bytesRead
}

Expand All @@ -93,8 +95,8 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}
}

return NewMultiTermSearcher(ctx, indexReader, candidates, field,
boost, options, true)
return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
boost, editDistances, options, true)
}

func getAutoFuzziness(term string) int {
Expand All @@ -113,8 +115,9 @@ func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, te
}

type fuzzyCandidates struct {
candidates []string
bytesRead uint64
candidates []string
editDistances []uint8
bytesRead uint64
}

func reportIOStats(ctx context.Context, bytesRead uint64) {
Expand All @@ -132,7 +135,8 @@ func reportIOStats(ctx context.Context, bytesRead uint64) {
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
rv = &fuzzyCandidates{
candidates: make([]string, 0),
candidates: make([]string, 0),
editDistances: make([]uint8, 0),
}

// in case of advanced reader implementations directly call
Expand All @@ -151,6 +155,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, tfd.EditDistance)
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand Down Expand Up @@ -185,6 +190,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
if !exceeded && ld <= fuzziness {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, uint8(ld))
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand Down
51 changes: 51 additions & 0 deletions search/searcher/search_multi_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,31 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te
options, limit)
}

// Works similarly to the multi term searcher but additionally boosts individual terms based on
// their edit distance from the query terms
func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string,
field string, boost float64, editDistances []uint8, options search.SearcherOptions, limit bool) (
search.Searcher, error) {

if tooManyClauses(len(terms)) {
if optionsDisjunctionOptimizable(options) {
return optimizeMultiTermSearcher(ctx, indexReader, terms, field, boost, options)
}
if limit {
return nil, tooManyClausesErr(field, len(terms))
}
}

qsearchers, err := makeBatchSearchersBoosted(ctx, indexReader, terms, field, boost, editDistances, options)
if err != nil {
return nil, err
}

// build disjunction searcher of these ranges
return newMultiTermSearcherInternal(ctx, indexReader, qsearchers, field, boost,
options, limit)
}

func NewMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
Expand Down Expand Up @@ -151,6 +176,32 @@ func makeBatchSearchers(ctx context.Context, indexReader index.IndexReader, term
return qsearchers, nil
}

func makeBatchSearchersBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, field string,
boost float64, editDistances []uint8, options search.SearcherOptions) ([]search.Searcher, error) {

qsearchers := make([]search.Searcher, len(terms))
qsearchersClose := func() {
for _, searcher := range qsearchers {
if searcher != nil {
_ = searcher.Close()
}
}
}
for i, term := range terms {
var err error
var editMultiplier float64
if editDistances != nil {
editMultiplier = 1 / float64(editDistances[i]+1)
}
qsearchers[i], err = NewTermSearcher(ctx, indexReader, term, field, boost*editMultiplier, options)
if err != nil {
qsearchersClose()
return nil, err
}
}
return qsearchers, nil
}

func optimizeMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
Expand Down

0 comments on commit 38f1737

Please sign in to comment.