Skip to content

Commit

Permalink
MB-19243: Detect fuzziness automatically based on term length (#2060)
Browse files Browse the repository at this point in the history
- The following queries can now automatically detect fuzziness based on
the length of the terms:
  - Match Query
  - Fuzzy Query
  - Match-Phrase Query
  - Multi-Phrase Query
  - Phrase Query
- In these queries, each term (whether in a multi-term query like Match
or Phrase, or in a single-term query like Fuzzy) can have its own edit
distance based on its length. The edit distance is calculated as
follows:
  - For terms with 1 or 2 characters: edit distance = 0 (exact match)
  - For terms with 3, 4, or 5 characters: edit distance = 1 (fuzzy match)
  - For terms with more than 5 characters: edit distance = 2 (fuzzy match)
- This feature can be enabled using the
`<query>.SetAutoFuzziness(<bool>)` API.
- Additionally, we've extended the functionality to query JSON parsing.
You can specify fuzziness as either "auto" or a static value in the JSON
query. Both formats are valid:
1. With auto fuzziness:
```
{
  "match" : "lorem",
  "field" : "bleve",
  "fuzziness" : "auto"
}
```
2. With static fuzziness:
```
{
  "match" : "lorem",
  "field" : "bleve",
  "fuzziness" : 2
}
```
When unmarshalled, the query will correctly apply the chosen fuzziness
method.
- Fixed a bug where the code incorrectly returned an error message
saying `fuzziness exceeds maximum` when using a fuzzy searcher with
`fuzziness = 0`. Instead, a term searcher is now returned in this case.
  • Loading branch information
CascadingRadium authored Nov 20, 2024
1 parent d002624 commit 3a21667
Show file tree
Hide file tree
Showing 9 changed files with 514 additions and 32 deletions.
55 changes: 55 additions & 0 deletions search/query/fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/search/searcher"
"github.com/blevesearch/bleve/v2/util"
index "github.com/blevesearch/bleve_index_api"
)

Expand All @@ -29,6 +30,7 @@ type FuzzyQuery struct {
Fuzziness int `json:"fuzziness"`
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
autoFuzzy bool
}

// NewFuzzyQuery creates a new Query which finds
Expand Down Expand Up @@ -66,6 +68,10 @@ func (q *FuzzyQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetAutoFuzziness enables or disables automatic fuzziness for this query.
// While it is enabled, Searcher builds its result with
// searcher.NewAutoFuzzySearcher and does not pass q.Fuzziness.
func (q *FuzzyQuery) SetAutoFuzziness(a bool) {
q.autoFuzzy = a
}

// SetPrefix sets q.Prefix, the value Searcher hands to the fuzzy searcher
// constructors as their prefix argument.
func (q *FuzzyQuery) SetPrefix(p int) {
q.Prefix = p
}
Expand All @@ -75,5 +81,54 @@ func (q *FuzzyQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin
if q.FieldVal == "" {
field = m.DefaultSearchField()
}
if q.autoFuzzy {
return searcher.NewAutoFuzzySearcher(ctx, i, q.Term, q.Prefix, field, q.BoostVal.Value(), options)
}
return searcher.NewFuzzySearcher(ctx, i, q.Term, q.Prefix, q.Fuzziness, field, q.BoostVal.Value(), options)
}

// UnmarshalJSON decodes a fuzzy query from JSON. The "fuzziness" key may hold
// either a number (a fixed edit distance, stored in q.Fuzziness) or the string
// "auto" (which turns on q.autoFuzzy); the remaining keys are decoded through
// the tagged fields of FuzzyQuery.
//
// A numeric value also switches auto fuzziness off, so decoding into a query
// that was previously set to auto cannot leave the explicit number ignored.
// If the decoded value is neither a float64 nor the string "auto", both
// settings are left as they were.
func (q *FuzzyQuery) UnmarshalJSON(data []byte) error {
	// Alias has FuzzyQuery's fields but none of its methods, so decoding
	// through it does not call back into this method.
	type Alias FuzzyQuery
	aux := &struct {
		// Declared at the outer level so it takes the "fuzziness" key in
		// place of the promoted int field and can receive a string as well.
		Fuzziness interface{} `json:"fuzziness"`
		*Alias
	}{
		Alias: (*Alias)(q),
	}
	if err := util.UnmarshalJSON(data, &aux); err != nil {
		return err
	}
	switch v := aux.Fuzziness.(type) {
	case float64:
		q.Fuzziness = int(v)
		// An explicit edit distance supersedes any earlier auto setting.
		q.autoFuzzy = false
	case string:
		if v == "auto" {
			q.autoFuzzy = true
		}
	}
	return nil
}

// MarshalJSON encodes the query as JSON. "fuzziness" is written as the string
// "auto" when auto fuzziness is enabled and as the numeric q.Fuzziness
// otherwise; term, prefix_length, field and boost are copied through as-is.
func (q *FuzzyQuery) MarshalJSON() ([]byte, error) {
	// Default to the static edit distance; auto mode replaces it below.
	var fuzzyValue interface{} = q.Fuzziness
	if q.autoFuzzy {
		fuzzyValue = "auto"
	}
	// Local mirror of FuzzyQuery whose Fuzziness can hold either an int or
	// the "auto" string. It has no methods, so encoding it does not recurse.
	type fuzzyQuery struct {
		Term      string      `json:"term"`
		Prefix    int         `json:"prefix_length"`
		Fuzziness interface{} `json:"fuzziness"`
		FieldVal  string      `json:"field,omitempty"`
		BoostVal  *Boost      `json:"boost,omitempty"`
	}
	aux := fuzzyQuery{
		Term:      q.Term,
		Prefix:    q.Prefix,
		Fuzziness: fuzzyValue,
		FieldVal:  q.FieldVal,
		BoostVal:  q.BoostVal,
	}
	return util.MarshalJSON(aux)
}
63 changes: 61 additions & 2 deletions search/query/match.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type MatchQuery struct {
Prefix int `json:"prefix_length"`
Fuzziness int `json:"fuzziness"`
Operator MatchQueryOperator `json:"operator,omitempty"`
autoFuzzy bool
}

type MatchQueryOperator int
Expand Down Expand Up @@ -107,6 +108,10 @@ func (q *MatchQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetAutoFuzziness enables or disables automatic fuzziness for this query.
// While it is enabled, Searcher builds a fuzzy query for every analyzed token
// and turns auto fuzziness on for it instead of applying q.Fuzziness.
func (q *MatchQuery) SetAutoFuzziness(auto bool) {
q.autoFuzzy = auto
}

// SetPrefix sets q.Prefix, which Searcher forwards to each per-token fuzzy
// query through its SetPrefix method.
func (q *MatchQuery) SetPrefix(p int) {
q.Prefix = p
}
Expand Down Expand Up @@ -138,10 +143,14 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin
if len(tokens) > 0 {

tqs := make([]Query, len(tokens))
if q.Fuzziness != 0 {
if q.Fuzziness != 0 || q.autoFuzzy {
for i, token := range tokens {
query := NewFuzzyQuery(string(token.Term))
query.SetFuzziness(q.Fuzziness)
if q.autoFuzzy {
query.SetAutoFuzziness(true)
} else {
query.SetFuzziness(q.Fuzziness)
}
query.SetPrefix(q.Prefix)
query.SetField(field)
query.SetBoost(q.BoostVal.Value())
Expand Down Expand Up @@ -175,3 +184,53 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin
noneQuery := NewMatchNoneQuery()
return noneQuery.Searcher(ctx, i, m, options)
}

// UnmarshalJSON decodes a match query from JSON. The "fuzziness" key may hold
// either a number (a fixed edit distance, stored in q.Fuzziness) or the string
// "auto" (which turns on q.autoFuzzy); the remaining keys are decoded through
// the tagged fields of MatchQuery.
//
// A numeric value also switches auto fuzziness off, so decoding into a query
// that was previously set to auto cannot leave the explicit number ignored.
// If the decoded value is neither a float64 nor the string "auto", both
// settings are left as they were.
func (q *MatchQuery) UnmarshalJSON(data []byte) error {
	// Alias has MatchQuery's fields but none of its methods, so decoding
	// through it does not call back into this method.
	type Alias MatchQuery
	aux := &struct {
		// Declared at the outer level so it takes the "fuzziness" key in
		// place of the promoted int field and can receive a string as well.
		Fuzziness interface{} `json:"fuzziness"`
		*Alias
	}{
		Alias: (*Alias)(q),
	}
	if err := util.UnmarshalJSON(data, &aux); err != nil {
		return err
	}
	switch v := aux.Fuzziness.(type) {
	case float64:
		q.Fuzziness = int(v)
		// An explicit edit distance supersedes any earlier auto setting.
		q.autoFuzzy = false
	case string:
		if v == "auto" {
			q.autoFuzzy = true
		}
	}
	return nil
}

// MarshalJSON encodes the query as JSON. "fuzziness" is written as the string
// "auto" when auto fuzziness is enabled and as the numeric q.Fuzziness
// otherwise; match, field, analyzer, boost, prefix_length and operator are
// copied through as-is.
func (q *MatchQuery) MarshalJSON() ([]byte, error) {
	// Default to the static edit distance; auto mode replaces it below.
	var fuzzyValue interface{} = q.Fuzziness
	if q.autoFuzzy {
		fuzzyValue = "auto"
	}
	// Local mirror of MatchQuery whose Fuzziness can hold either an int or
	// the "auto" string. It has no methods, so encoding it does not recurse.
	type match struct {
		Match     string             `json:"match"`
		FieldVal  string             `json:"field,omitempty"`
		Analyzer  string             `json:"analyzer,omitempty"`
		BoostVal  *Boost             `json:"boost,omitempty"`
		Prefix    int                `json:"prefix_length"`
		Fuzziness interface{}        `json:"fuzziness"`
		Operator  MatchQueryOperator `json:"operator,omitempty"`
	}
	aux := match{
		Match:     q.Match,
		FieldVal:  q.FieldVal,
		Analyzer:  q.Analyzer,
		BoostVal:  q.BoostVal,
		Prefix:    q.Prefix,
		Fuzziness: fuzzyValue,
		Operator:  q.Operator,
	}
	return util.MarshalJSON(aux)
}
58 changes: 57 additions & 1 deletion search/query/match_phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/util"
index "github.com/blevesearch/bleve_index_api"
)

Expand All @@ -30,6 +31,7 @@ type MatchPhraseQuery struct {
Analyzer string `json:"analyzer,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
autoFuzzy bool
}

// NewMatchPhraseQuery creates a new Query object
Expand Down Expand Up @@ -63,6 +65,10 @@ func (q *MatchPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetAutoFuzziness enables or disables automatic fuzziness for this query.
// While it is enabled, Searcher turns auto fuzziness on for the multi-phrase
// query it builds instead of passing q.Fuzziness to it.
func (q *MatchPhraseQuery) SetAutoFuzziness(auto bool) {
q.autoFuzzy = auto
}

// Field returns q.FieldVal, the field name stored on this query.
func (q *MatchPhraseQuery) Field() string {
return q.FieldVal
}
Expand All @@ -89,7 +95,11 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m
phrase := tokenStreamToPhrase(tokens)
phraseQuery := NewMultiPhraseQuery(phrase, field)
phraseQuery.SetBoost(q.BoostVal.Value())
phraseQuery.SetFuzziness(q.Fuzziness)
if q.autoFuzzy {
phraseQuery.SetAutoFuzziness(true)
} else {
phraseQuery.SetFuzziness(q.Fuzziness)
}
return phraseQuery.Searcher(ctx, i, m, options)
}
noneQuery := NewMatchNoneQuery()
Expand Down Expand Up @@ -118,3 +128,49 @@ func tokenStreamToPhrase(tokens analysis.TokenStream) [][]string {
}
return nil
}

// UnmarshalJSON decodes a match-phrase query from JSON. The "fuzziness" key
// may hold either a number (a fixed edit distance, stored in q.Fuzziness) or
// the string "auto" (which turns on q.autoFuzzy); the remaining keys are
// decoded through the tagged fields of MatchPhraseQuery.
//
// A numeric value also switches auto fuzziness off, so decoding into a query
// that was previously set to auto cannot leave the explicit number ignored.
// If the decoded value is neither a float64 nor the string "auto", both
// settings are left as they were.
func (q *MatchPhraseQuery) UnmarshalJSON(data []byte) error {
	// Alias has MatchPhraseQuery's fields but none of its methods, so
	// decoding through it does not call back into this method.
	type Alias MatchPhraseQuery
	aux := &struct {
		// Declared at the outer level so it takes the "fuzziness" key in
		// place of the promoted int field and can receive a string as well.
		Fuzziness interface{} `json:"fuzziness"`
		*Alias
	}{
		Alias: (*Alias)(q),
	}
	if err := util.UnmarshalJSON(data, &aux); err != nil {
		return err
	}
	switch v := aux.Fuzziness.(type) {
	case float64:
		q.Fuzziness = int(v)
		// An explicit edit distance supersedes any earlier auto setting.
		q.autoFuzzy = false
	case string:
		if v == "auto" {
			q.autoFuzzy = true
		}
	}
	return nil
}

// MarshalJSON encodes the query as JSON. "fuzziness" is written as the string
// "auto" when auto fuzziness is enabled and as the numeric q.Fuzziness
// otherwise; match_phrase, field, analyzer and boost are copied through as-is.
func (q *MatchPhraseQuery) MarshalJSON() ([]byte, error) {
	// Default to the static edit distance; auto mode replaces it below.
	var fuzzyValue interface{} = q.Fuzziness
	if q.autoFuzzy {
		fuzzyValue = "auto"
	}
	// Local mirror of MatchPhraseQuery whose Fuzziness can hold either an int
	// or the "auto" string. It has no methods, so encoding it does not recurse.
	type matchPhrase struct {
		MatchPhrase string      `json:"match_phrase"`
		FieldVal    string      `json:"field,omitempty"`
		Analyzer    string      `json:"analyzer,omitempty"`
		BoostVal    *Boost      `json:"boost,omitempty"`
		Fuzziness   interface{} `json:"fuzziness"`
	}
	aux := matchPhrase{
		MatchPhrase: q.MatchPhrase,
		FieldVal:    q.FieldVal,
		Analyzer:    q.Analyzer,
		BoostVal:    q.BoostVal,
		Fuzziness:   fuzzyValue,
	}
	return util.MarshalJSON(aux)
}
53 changes: 44 additions & 9 deletions search/query/multi_phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type MultiPhraseQuery struct {
FieldVal string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
autoFuzzy bool
}

// NewMultiPhraseQuery creates a new Query for finding
Expand All @@ -52,6 +53,10 @@ func (q *MultiPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// SetAutoFuzziness enables or disables automatic fuzziness for this query.
// The flag is stored in q.autoFuzzy, which Searcher passes on to
// searcher.NewMultiPhraseSearcher.
func (q *MultiPhraseQuery) SetAutoFuzziness(auto bool) {
q.autoFuzzy = auto
}

func (q *MultiPhraseQuery) SetBoost(b float64) {
boost := Boost(b)
q.BoostVal = &boost
Expand All @@ -70,7 +75,7 @@ func (q *MultiPhraseQuery) SetField(f string) {
}

// Searcher returns the searcher produced by searcher.NewMultiPhraseSearcher
// for this query's terms, fuzziness settings, field and boost value, using the
// supplied context, index reader and searcher options.
func (q *MultiPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.FieldVal, q.BoostVal.Value(), options)
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.autoFuzzy, q.FieldVal, q.BoostVal.Value(), options)
}

func (q *MultiPhraseQuery) Validate() error {
Expand All @@ -81,15 +86,45 @@ func (q *MultiPhraseQuery) Validate() error {
}

// UnmarshalJSON decodes a multi-phrase query from JSON. In the Alias-based
// lines, "fuzziness" may be a number, stored in q.Fuzziness, or the string
// "auto", which turns on q.autoFuzzy.
// NOTE(review): the tmp-based lines look like the previous implementation
// shown alongside its replacement — confirm against the actual file.
func (q *MultiPhraseQuery) UnmarshalJSON(data []byte) error {
type _mphraseQuery MultiPhraseQuery
tmp := _mphraseQuery{}
err := util.UnmarshalJSON(data, &tmp)
if err != nil {
type Alias MultiPhraseQuery
aux := &struct {
Fuzziness interface{} `json:"fuzziness"`
*Alias
}{
Alias: (*Alias)(q),
}
if err := util.UnmarshalJSON(data, &aux); err != nil {
return err
}
q.Terms = tmp.Terms
q.FieldVal = tmp.FieldVal
q.BoostVal = tmp.BoostVal
q.Fuzziness = tmp.Fuzziness
switch v := aux.Fuzziness.(type) {
case float64:
q.Fuzziness = int(v)
case string:
if v == "auto" {
q.autoFuzzy = true
}
}
return nil
}

// MarshalJSON encodes the query as JSON. "fuzziness" is written as the string
// "auto" when auto fuzziness is enabled and as the numeric q.Fuzziness
// otherwise; terms, field and boost are copied through as-is.
func (q *MultiPhraseQuery) MarshalJSON() ([]byte, error) {
	// Default to the static edit distance; auto mode replaces it below.
	var fuzzyValue interface{} = q.Fuzziness
	if q.autoFuzzy {
		fuzzyValue = "auto"
	}
	// Local mirror of MultiPhraseQuery whose Fuzziness can hold either an int
	// or the "auto" string. It has no methods, so encoding it does not recurse.
	type multiPhraseQuery struct {
		Terms     [][]string  `json:"terms"`
		FieldVal  string      `json:"field,omitempty"`
		BoostVal  *Boost      `json:"boost,omitempty"`
		Fuzziness interface{} `json:"fuzziness"`
	}
	aux := multiPhraseQuery{
		Terms:     q.Terms,
		FieldVal:  q.FieldVal,
		BoostVal:  q.BoostVal,
		Fuzziness: fuzzyValue,
	}
	return util.MarshalJSON(aux)
}
Loading

0 comments on commit 3a21667

Please sign in to comment.