diff --git a/analysis/type.go b/analysis/type.go
index e3a7f201b..f819984b5 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -106,6 +106,15 @@ type DateTimeParser interface {
 	ParseDateTime(string) (time.Time, string, error)
 }
 
+const SynonymSourceType = "synonym"
+
+type SynonymSourceVisitor func(name string, item SynonymSource) error
+
+type SynonymSource interface {
+	Analyzer() string
+	Collection() string
+}
+
 type ByteArrayConverter interface {
 	Convert([]byte) (interface{}, error)
 }
diff --git a/document/document.go b/document/document.go
index 54fd6d442..0f9591c85 100644
--- a/document/document.go
+++ b/document/document.go
@@ -48,6 +48,13 @@ func NewDocument(id string) *Document {
 	}
 }
 
+func NewSynonymDocument(id string) *Document {
+	return &Document{
+		id:     id,
+		Fields: make([]Field, 0),
+	}
+}
+
 func (d *Document) Size() int {
 	sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
 		len(d.id)
@@ -133,3 +140,11 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) {
 func (d *Document) HasComposite() bool {
 	return len(d.CompositeFields) > 0
 }
+
+func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
+	for _, f := range d.Fields {
+		if sf, ok := f.(index.SynonymField); ok {
+			visitor(sf)
+		}
+	}
+}
diff --git a/document/field_synonym.go b/document/field_synonym.go
new file mode 100644
index 000000000..4aa603fc4
--- /dev/null
+++ b/document/field_synonym.go
@@ -0,0 +1,143 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package document
+
+import (
+	"reflect"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/size"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+var reflectStaticSizeSynonymField int
+
+func init() {
+	var f SynonymField
+	reflectStaticSizeSynonymField = int(reflect.TypeOf(f).Size())
+}
+
+const DefaultSynonymIndexingOptions = index.IndexField
+
+type SynonymField struct {
+	name              string
+	analyzer          analysis.Analyzer
+	options           index.FieldIndexingOptions
+	input             []string
+	synonyms          []string
+	numPlainTextBytes uint64
+
+	// populated during analysis
+	synonymMap map[string][]string
+}
+
+func (s *SynonymField) Size() int {
+	return reflectStaticSizeSynonymField + size.SizeOfPtr +
+		len(s.name)
+}
+
+func (s *SynonymField) Name() string {
+	return s.name
+}
+
+func (s *SynonymField) ArrayPositions() []uint64 {
+	return nil
+}
+
+func (s *SynonymField) Options() index.FieldIndexingOptions {
+	return s.options
+}
+
+func (s *SynonymField) NumPlainTextBytes() uint64 {
+	return s.numPlainTextBytes
+}
+
+func (s *SynonymField) AnalyzedLength() int {
+	return 0
+}
+
+func (s *SynonymField) EncodedFieldType() byte {
+	return 'y'
+}
+
+func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
+	return nil
+}
+
+func (s *SynonymField) Analyze() {
+	var analyzedInput []string
+	if len(s.input) > 0 {
+		analyzedInput = make([]string, 0, len(s.input))
+		for _, term := range s.input {
+			analyzedInput = append(analyzedInput, analyzeSynonymTerm(term, s.analyzer))
+		}
+	}
+	analyzedSynonyms := make([]string, 0, len(s.synonyms))
+	for _, syn := range s.synonyms {
+		analyzedSynonyms = append(analyzedSynonyms, analyzeSynonymTerm(syn, s.analyzer))
+	}
+	s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms)
+}
+
+func (s *SynonymField) Value() []byte {
+	return nil
+}
+
+func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
+	for term, synonyms := range s.synonymMap {
+		visitor(term, synonyms)
+	}
+}
+
+func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField {
+	return &SynonymField{
+		name:     name,
+		analyzer: analyzer,
+		options:  DefaultSynonymIndexingOptions,
+		input:    input,
+		synonyms: synonyms,
+	}
+}
+
+func processSynonymData(input []string, synonyms []string) map[string][]string {
+	var synonymMap map[string][]string
+	if len(input) > 0 {
+		// Map each input term to the same list of synonyms.
+		synonymMap = make(map[string][]string, len(input))
+		for _, term := range input {
+			synonymMap[term] = synonyms
+		}
+	} else {
+		synonymMap = make(map[string][]string, len(synonyms))
+		// Precompute a map where each synonym points to all other synonyms.
+		for i, elem := range synonyms {
+			synonymMap[elem] = make([]string, 0, len(synonyms)-1)
+			for j, otherElem := range synonyms {
+				if i != j {
+					synonymMap[elem] = append(synonymMap[elem], otherElem)
+				}
+			}
+		}
+	}
+	return synonymMap
+}
+
+func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string {
+	tokenStream := analyzer.Analyze([]byte(term))
+	if len(tokenStream) == 0 {
+		return term
+	}
+	return string(tokenStream[0].Term)
+}
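The two expansion modes of `processSynonymData` are easiest to see in isolation. Below is a small, self-contained sketch of the same logic outside of bleve; the terms used are illustrative, not taken from the patch:

```go
package main

import "fmt"

// expand mirrors processSynonymData above: with explicit input terms the
// mapping is unidirectional (each input -> all synonyms); with no input
// terms every synonym maps to all of the others (bidirectional).
func expand(input, synonyms []string) map[string][]string {
	out := make(map[string][]string)
	if len(input) > 0 {
		for _, term := range input {
			out[term] = synonyms
		}
		return out
	}
	for i, elem := range synonyms {
		for j, other := range synonyms {
			if i != j {
				out[elem] = append(out[elem], other)
			}
		}
	}
	return out
}

func main() {
	// unidirectional: "auto" -> [car automobile], but not the reverse
	fmt.Println(expand([]string{"auto"}, []string{"car", "automobile"}))
	// bidirectional: car <-> automobile <-> vehicle
	fmt.Println(expand(nil, []string{"car", "automobile", "vehicle"}))
}
```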
diff --git a/error.go b/error.go
index 2d2751cd4..b57a61543 100644
--- a/error.go
+++ b/error.go
@@ -27,6 +27,7 @@ const (
 	ErrorEmptyID
 	ErrorIndexReadInconsistency
 	ErrorTwoPhaseSearchInconsistency
+	ErrorSynonymSearchNotSupported
 )
 
 // Error represents a more strongly typed bleve error for detecting
@@ -49,4 +50,5 @@ var errorMessages = map[Error]string{
 	ErrorEmptyID:                     "document ID cannot be empty",
 	ErrorIndexReadInconsistency:      "index read inconsistency detected",
 	ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change",
+	ErrorSynonymSearchNotSupported:   "synonym search not supported",
 }
diff --git a/go.mod b/go.mod
index 31f8a34bc..9743d6bf0 100644
--- a/go.mod
+++ b/go.mod
@@ -5,20 +5,20 @@ go 1.21
 require (
 	github.com/RoaringBitmap/roaring v1.9.3
 	github.com/bits-and-blooms/bitset v1.12.0
-	github.com/blevesearch/bleve_index_api v1.1.13
+	github.com/blevesearch/bleve_index_api v1.2.0
 	github.com/blevesearch/geo v0.1.20
 	github.com/blevesearch/go-faiss v1.0.23
 	github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
 	github.com/blevesearch/go-porterstemmer v1.0.3
 	github.com/blevesearch/goleveldb v1.0.1
 	github.com/blevesearch/gtreap v0.1.1
-	github.com/blevesearch/scorch_segment_api/v2 v2.2.16
+	github.com/blevesearch/scorch_segment_api/v2 v2.3.0
 	github.com/blevesearch/segment v0.9.1
 	github.com/blevesearch/snowball v0.6.1
 	github.com/blevesearch/snowballstem v0.9.0
 	github.com/blevesearch/stempel v0.2.0
 	github.com/blevesearch/upsidedown_store_api v1.0.2
-	github.com/blevesearch/vellum v1.0.11
+	github.com/blevesearch/vellum v1.1.0
 	github.com/blevesearch/zapx/v11 v11.3.10
 	github.com/blevesearch/zapx/v12 v12.3.10
 	github.com/blevesearch/zapx/v13 v13.3.10
diff --git a/go.sum b/go.sum
index 088c6aafb..28bc2b8d0 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
 github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
 github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
 github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
-github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
-github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
+github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
+github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
 github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
 github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
 github.com/blevesearch/go-faiss v1.0.23 h1:Wmc5AFwDLKGl2L6mjLX1Da3vCL0EKa2uHHSorcIS1Uc=
@@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
 github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
 github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
 github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
 github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
 github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
 github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
@@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
 github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
 github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
 github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
-github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
-github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
+github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
+github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
 github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
 github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
 github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
diff --git a/index.go b/index.go
index acbefc695..3d2389884 100644
--- a/index.go
+++ b/index.go
@@ -16,6 +16,7 @@ package bleve
 
 import (
 	"context"
+	"fmt"
 
 	"github.com/blevesearch/bleve/v2/index/upsidedown"
 
@@ -63,6 +64,36 @@ func (b *Batch) Index(id string, data interface{}) error {
 	return nil
 }
 
+func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	if id == "" {
+		return ErrorEmptyID
+	}
+	if eventIndex, ok := b.index.(index.EventIndex); ok {
+		eventIndex.FireIndexEvent()
+	}
+	synMap, ok := b.index.Mapping().(mapping.SynonymMapping)
+	if !ok {
+		return ErrorSynonymSearchNotSupported
+	}
+
+	if err := definition.Validate(); err != nil {
+		return err
+	}
+
+	doc := document.NewSynonymDocument(id)
+	err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms)
+	if err != nil {
+		return err
+	}
+	b.internal.Update(doc)
+
+	b.lastDocSize = uint64(doc.Size() +
+		len(id) + size.SizeOfString) // overhead from internal
+	b.totalSize += b.lastDocSize
+
+	return nil
+}
+
 func (b *Batch) LastDocSize() uint64 {
 	return b.lastDocSize
 }
@@ -323,3 +354,35 @@ type IndexCopyable interface {
 // FileSystemDirectory is the default implementation for the
 // index.Directory interface.
 type FileSystemDirectory string
+
+// SynonymDefinition represents a synonym mapping in Bleve.
+// Each instance associates one or more input terms with a list of synonyms,
+// defining how terms are treated as equivalent in searches.
+type SynonymDefinition struct {
+	// Input is an optional list of terms for unidirectional synonym mapping.
+	// When terms are specified in Input, they will map to the terms in Synonyms,
+	// making the relationship unidirectional (each Input maps to all Synonyms).
+	// If Input is omitted, the relationship is bidirectional among all Synonyms.
+	Input []string `json:"input,omitempty"`
+
+	// Synonyms is a list of terms that are considered equivalent.
+	// If Input is specified, each term in Input will map to each term in Synonyms.
+	// If Input is not specified, the Synonyms list will be treated bidirectionally,
+	// meaning each term in Synonyms is treated as synonymous with all others.
+	Synonyms []string `json:"synonyms"`
+}
+
+func (sd *SynonymDefinition) Validate() error {
+	if len(sd.Synonyms) == 0 {
+		return fmt.Errorf("synonym definition must have at least one synonym")
+	}
+	return nil
+}
+
+// SynonymIndex supports indexing synonym definitions alongside regular documents.
+// Synonyms, grouped by collection name, define term relationships for query expansion in searches.
+type SynonymIndex interface {
+	Index
+	// IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection.
+	IndexSynonym(id string, collection string, definition *SynonymDefinition) error
+}
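Client code drives this through the new `SynonymIndex` interface. A minimal hedged sketch, assuming `idx` is a `bleve.Index` whose mapping defines a synonym source over an "english" collection; the id, collection, and terms below are illustrative:

```go
// Sketch: indexing a synonym definition (the index's mapping must define
// a synonym source over this collection for the definition to take effect).
def := &bleve.SynonymDefinition{
	// no Input: the relationship is bidirectional among all Synonyms
	Synonyms: []string{"car", "automobile", "vehicle"},
}
if si, ok := idx.(bleve.SynonymIndex); ok {
	if err := si.IndexSynonym("syn-1", "english", def); err != nil {
		log.Fatal(err)
	}
}
```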
diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go
index 79840a41f..efb85b0f8 100644
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@@ -45,7 +45,7 @@ type asynchSegmentResult struct {
 	index int
 	docs  *roaring.Bitmap
 
-	postings segment.PostingsList
+	thesItr segment.ThesaurusIterator
 
 	err error
 }
@@ -60,6 +60,7 @@ var reflectStaticSizeIndexSnapshot int
 // exported variable, or at the index level by setting the FieldTFRCacheThreshold
 // in the kvConfig.
 var DefaultFieldTFRCacheThreshold uint64 = 10
+var DefaultThesaurusTermReaderCacheThreshold uint64 = 10
 
 func init() {
 	var is interface{} = IndexSnapshot{}
@@ -87,8 +88,9 @@ type IndexSnapshot struct {
 	m    sync.Mutex // Protects the fields that follow.
 	refs int64
 
-	m2        sync.Mutex                                 // Protects the fields that follow.
-	fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
+	m2                   sync.Mutex                                     // Protects the fields that follow.
+	fieldTFRs            map[string][]*IndexSnapshotTermFieldReader     // keyed by field, recycled TFR's
+	thesaurusTermReaders map[string][]*IndexSnapshotThesaurusTermReader // keyed by thesaurus name, recycled thesaurus readers
 }
 
 func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
@@ -269,17 +271,32 @@ func (is *IndexSnapshot) FieldDictPrefix(field string,
 
 func (is *IndexSnapshot) FieldDictRegexp(field string,
 	termRegex string) (index.FieldDict, error) {
+	fd, _, err := is.FieldDictRegexpAutomaton(field, termRegex)
+	return fd, err
+}
+
+func (is *IndexSnapshot) FieldDictRegexpAutomaton(field string,
+	termRegex string) (index.FieldDict, index.RegexAutomaton, error) {
+	return is.fieldDictRegexp(field, termRegex)
+}
+
+func (is *IndexSnapshot) fieldDictRegexp(field string,
+	termRegex string) (index.FieldDict, index.RegexAutomaton, error) {
 	// TODO: potential optimization where the literal prefix represents the,
 	// entire regexp, allowing us to use PrefixIterator(prefixTerm)?
 	a, prefixBeg, prefixEnd, err := parseRegexp(termRegex)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
-	return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator {
+	fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator {
 		return is.AutomatonIterator(a, prefixBeg, prefixEnd)
 	}, false)
+	if err != nil {
+		return nil, nil, err
+	}
+	return fd, a, nil
 }
 
 func (is *IndexSnapshot) getLevAutomaton(term string,
@@ -294,20 +311,37 @@ func (is *IndexSnapshot) getLevAutomaton(term string,
 
 func (is *IndexSnapshot) FieldDictFuzzy(field string,
 	term string, fuzziness int, prefix string) (index.FieldDict, error) {
+	fd, _, err := is.FieldDictFuzzyAutomaton(field, term, fuzziness, prefix)
+	return fd, err
+}
+
+func (is *IndexSnapshot) FieldDictFuzzyAutomaton(field string,
+	term string, fuzziness int, prefix string) (index.FieldDict, index.FuzzyAutomaton, error) {
+	return is.fieldDictFuzzy(field, term, fuzziness, prefix)
+}
+
+func (is *IndexSnapshot) fieldDictFuzzy(field string,
+	term string, fuzziness int, prefix string) (index.FieldDict, index.FuzzyAutomaton, error) {
 	a, err := is.getLevAutomaton(term, uint8(fuzziness))
 	if err != nil {
-		return nil, err
+		return nil, nil, err
+	}
+	var fa index.FuzzyAutomaton
+	if vfa, ok := a.(vellum.FuzzyAutomaton); ok {
+		fa = vfa
 	}
-
 	var prefixBeg, prefixEnd []byte
 	if prefix != "" {
 		prefixBeg = []byte(prefix)
 		prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg)
 	}
-
-	return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator {
+	fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator {
 		return is.AutomatonIterator(a, prefixBeg, prefixEnd)
 	}, false)
+	if err != nil {
+		return nil, nil, err
+	}
+	return fd, fa, nil
 }
 
 func (is *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) {
@@ -649,6 +683,15 @@ func (is *IndexSnapshot) getFieldTFRCacheThreshold() uint64 {
 	return DefaultFieldTFRCacheThreshold
 }
 
+func (is *IndexSnapshot) getThesaurusTermReaderCacheThreshold() uint64 {
+	if is.parent.config != nil {
+		if _, ok := is.parent.config["ThesaurusTermReaderCacheThreshold"]; ok {
+			return is.parent.config["ThesaurusTermReaderCacheThreshold"].(uint64)
+		}
+	}
+	return DefaultThesaurusTermReaderCacheThreshold
+}
+
 func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
 	if !tfr.recycle {
 		// Do not recycle an optimized unadorned term field reader (used for
@@ -677,6 +720,25 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade
 	is.m2.Unlock()
 }
 
+func (is *IndexSnapshot) recycleThesaurusTermReader(str *IndexSnapshotThesaurusTermReader) {
+	is.parent.rootLock.RLock()
+	obsolete := is.parent.root != is
+	is.parent.rootLock.RUnlock()
+	if obsolete {
+		// if we're not the current root (mutations happened), don't bother recycling
+		return
+	}
+
+	is.m2.Lock()
+	if is.thesaurusTermReaders == nil {
+		is.thesaurusTermReaders = map[string][]*IndexSnapshotThesaurusTermReader{}
+	}
+	if uint64(len(is.thesaurusTermReaders[str.name])) < is.getThesaurusTermReaderCacheThreshold() {
+		is.thesaurusTermReaders[str.name] = append(is.thesaurusTermReaders[str.name], str)
+	}
+	is.m2.Unlock()
+}
+
 func docNumberToBytes(buf []byte, in uint64) []byte {
 	if len(buf) != 8 {
 		if cap(buf) >= 8 {
@@ -956,3 +1018,155 @@ func (is *IndexSnapshot) CloseCopyReader() error {
 	// close the index snapshot normally
 	return is.Close()
 }
+
+func (is *IndexSnapshot) allocThesaurusTermReader(name string) (str *IndexSnapshotThesaurusTermReader) {
+	is.m2.Lock()
+	if is.thesaurusTermReaders != nil {
+		strs := is.thesaurusTermReaders[name]
+		last := len(strs) - 1
+		if last >= 0 {
+			str = strs[last]
+			strs[last] = nil
+			is.thesaurusTermReaders[name] = strs[:last]
+			is.m2.Unlock()
+			return
+		}
+	}
+	is.m2.Unlock()
+	return &IndexSnapshotThesaurusTermReader{}
+}
+
+func (is *IndexSnapshot) ThesaurusTermReader(ctx context.Context, thesaurusName string, term []byte) (index.ThesaurusTermReader, error) {
+	rv := is.allocThesaurusTermReader(thesaurusName)
+
+	rv.name = thesaurusName
+	rv.snapshot = is
+	if rv.postings == nil {
+		rv.postings = make([]segment.SynonymsList, len(is.segment))
+	}
+	if rv.iterators == nil {
+		rv.iterators = make([]segment.SynonymsIterator, len(is.segment))
+	}
+	rv.segmentOffset = 0
+
+	if rv.thesauri == nil {
+		rv.thesauri = make([]segment.Thesaurus, len(is.segment))
+		for i, s := range is.segment {
+			if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok {
+				thes, err := synSeg.Thesaurus(thesaurusName)
+				if err != nil {
+					return nil, err
+				}
+				rv.thesauri[i] = thes
+			}
+		}
+	}
+
+	for i, s := range is.segment {
+		if _, ok := s.segment.(segment.ThesaurusSegment); ok {
+			pl, err := rv.thesauri[i].SynonymsList(term, s.deleted, rv.postings[i])
+			if err != nil {
+				return nil, err
+			}
+			rv.postings[i] = pl
+
+			rv.iterators[i] = pl.Iterator(rv.iterators[i])
+		}
+	}
+	return rv, nil
+}
+
+func (is *IndexSnapshot) newIndexSnapshotThesaurusKeys(name string,
+	makeItr func(i segment.Thesaurus) segment.ThesaurusIterator) (*IndexSnapshotThesaurusKeys, error) {
+
+	results := make(chan *asynchSegmentResult, len(is.segment))
+	var wg sync.WaitGroup
+	wg.Add(len(is.segment))
+	for _, s := range is.segment {
+		go func(s *SegmentSnapshot) {
+			defer wg.Done()
+			if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok {
+				thes, err := synSeg.Thesaurus(name)
+				if err != nil {
+					results <- &asynchSegmentResult{err: err}
+				} else {
+					results <- &asynchSegmentResult{thesItr: makeItr(thes)}
+				}
+			}
+		}(s)
+	}
+	// Close the channel after all goroutines complete
+	go func() {
+		wg.Wait()
+		close(results)
+	}()
+
+	var err error
+	rv := &IndexSnapshotThesaurusKeys{
+		snapshot: is,
+		cursors:  make([]*segmentThesCursor, 0, len(is.segment)),
+	}
+	for asr := range results {
+		if asr.err != nil && err == nil {
+			err = asr.err
+		} else {
+			next, err2 := asr.thesItr.Next()
+			if err2 != nil && err == nil {
+				err = err2
+			}
+			if next != nil {
+				rv.cursors = append(rv.cursors, &segmentThesCursor{
+					itr:  asr.thesItr,
+					curr: *next,
+				})
+			}
+		}
+	}
+	// after ensuring we've read all items on channel
+	if err != nil {
+		return nil, err
+	}
+
+	return rv, nil
+}
+
+func (is *IndexSnapshot) ThesaurusKeys(name string) (index.ThesaurusKeys, error) {
+	return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator {
+		return is.AutomatonIterator(nil, nil, nil)
+	})
+}
+
+func (is *IndexSnapshot) ThesaurusKeysFuzzy(name string,
+	term string, fuzziness int, prefix string) (index.ThesaurusKeys, error) {
+	a, err := is.getLevAutomaton(term, uint8(fuzziness))
+	if err != nil {
+		return nil, err
+	}
+	var prefixBeg, prefixEnd []byte
+	if prefix != "" {
+		prefixBeg = []byte(prefix)
+		prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg)
+	}
+	return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator {
+		return is.AutomatonIterator(a, prefixBeg, prefixEnd)
+	})
+}
+
+func (is *IndexSnapshot) ThesaurusKeysPrefix(name string,
+	termPrefix []byte) (index.ThesaurusKeys, error) {
+	termPrefixEnd := calculateExclusiveEndFromPrefix(termPrefix)
+	return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator {
+		return is.AutomatonIterator(nil, termPrefix, termPrefixEnd)
+	})
+}
+
+func (is *IndexSnapshot) ThesaurusKeysRegexp(name string,
+	termRegex string) (index.ThesaurusKeys, error) {
+	a, prefixBeg, prefixEnd, err := parseRegexp(termRegex)
+	if err != nil {
+		return nil, err
+	}
+	return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator {
+		return is.AutomatonIterator(a, prefixBeg, prefixEnd)
+	})
+}
diff --git a/index/scorch/snapshot_index_str.go b/index/scorch/snapshot_index_str.go
new file mode 100644
index 000000000..c66fbeec5
--- /dev/null
+++ b/index/scorch/snapshot_index_str.go
@@ -0,0 +1,78 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package scorch
+
+import (
+	"reflect"
+
+	"github.com/blevesearch/bleve/v2/size"
+	segment "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+var reflectStaticSizeIndexSnapshotThesaurusTermReader int
+
+func init() {
+	var istr IndexSnapshotThesaurusTermReader
+	reflectStaticSizeIndexSnapshotThesaurusTermReader = int(reflect.TypeOf(istr).Size())
+}
+
+type IndexSnapshotThesaurusTermReader struct {
+	name          string
+	snapshot      *IndexSnapshot
+	thesauri      []segment.Thesaurus
+	postings      []segment.SynonymsList
+	iterators     []segment.SynonymsIterator
+	segmentOffset int
+}
+
+func (i *IndexSnapshotThesaurusTermReader) Size() int {
+	sizeInBytes := reflectStaticSizeIndexSnapshotThesaurusTermReader + size.SizeOfPtr +
+		len(i.name) + size.SizeOfString
+
+	for _, postings := range i.postings {
+		sizeInBytes += postings.Size()
+	}
+
+	for _, iterator := range i.iterators {
+		sizeInBytes += iterator.Size()
+	}
+
+	return sizeInBytes
+}
+
+func (i *IndexSnapshotThesaurusTermReader) Next() (string, error) {
+	// find the next hit
+	for i.segmentOffset < len(i.iterators) {
+		if i.iterators[i.segmentOffset] != nil {
+			next, err := i.iterators[i.segmentOffset].Next()
+			if err != nil {
+				return "", err
+			}
+			if next != nil {
+				synTerm := next.Term()
+				return synTerm, nil
+			}
+		}
+		// advance past exhausted (or nil) segment iterators so the loop
+		// always makes progress
+		i.segmentOffset++
+	}
+	return "", nil
+}
+
+func (i *IndexSnapshotThesaurusTermReader) Close() error {
+	if i.snapshot != nil {
+		i.snapshot.recycleThesaurusTermReader(i)
+	}
+	return nil
+}
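The reader behaves as a cursor across segments, yielding one synonym per `Next` call and an empty string once every segment iterator is exhausted. A hedged sketch of consuming it; the snapshot variable, thesaurus name, and term are illustrative:

```go
// Sketch: streaming the synonyms recorded for a term (assumes snap is an
// *scorch.IndexSnapshot obtained via the advanced index API; "en-thesaurus"
// names a synonym source).
str, err := snap.ThesaurusTermReader(context.Background(), "en-thesaurus", []byte("car"))
if err != nil {
	return err
}
defer str.Close() // recycles the reader on the snapshot
for {
	syn, err := str.Next()
	if err != nil {
		return err
	}
	if syn == "" { // all segments exhausted
		break
	}
	fmt.Println("synonym:", syn)
}
```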
diff --git a/index/scorch/snapshot_index_thes.go b/index/scorch/snapshot_index_thes.go
new file mode 100644
index 000000000..6f3aae818
--- /dev/null
+++ b/index/scorch/snapshot_index_thes.go
@@ -0,0 +1,107 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package scorch
+
+import (
+	"container/heap"
+
+	index "github.com/blevesearch/bleve_index_api"
+	segment "github.com/blevesearch/scorch_segment_api/v2"
+)
+
+type segmentThesCursor struct {
+	thes segment.Thesaurus
+	itr  segment.ThesaurusIterator
+	curr index.ThesaurusEntry
+}
+
+type IndexSnapshotThesaurusKeys struct {
+	snapshot *IndexSnapshot
+	cursors  []*segmentThesCursor
+	entry    index.ThesaurusEntry
+}
+
+func (i *IndexSnapshotThesaurusKeys) Len() int { return len(i.cursors) }
+func (i *IndexSnapshotThesaurusKeys) Less(a, b int) bool {
+	return i.cursors[a].curr.Term < i.cursors[b].curr.Term
+}
+func (i *IndexSnapshotThesaurusKeys) Swap(a, b int) {
+	i.cursors[a], i.cursors[b] = i.cursors[b], i.cursors[a]
+}
+
+func (i *IndexSnapshotThesaurusKeys) Push(x interface{}) {
+	i.cursors = append(i.cursors, x.(*segmentThesCursor))
+}
+
+func (i *IndexSnapshotThesaurusKeys) Pop() interface{} {
+	n := len(i.cursors)
+	x := i.cursors[n-1]
+	i.cursors = i.cursors[0 : n-1]
+	return x
+}
+
+func (i *IndexSnapshotThesaurusKeys) Next() (*index.ThesaurusEntry, error) {
+	if len(i.cursors) == 0 {
+		return nil, nil
+	}
+	i.entry = i.cursors[0].curr
+	next, err := i.cursors[0].itr.Next()
+	if err != nil {
+		return nil, err
+	}
+	if next == nil {
+		// at end of this cursor, remove it
+		heap.Pop(i)
+	} else {
+		// modified heap, fix it
+		i.cursors[0].curr = *next
+		heap.Fix(i, 0)
+	}
+	// look for any other entries with the exact same term
+	for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
+		next, err := i.cursors[0].itr.Next()
+		if err != nil {
+			return nil, err
+		}
+		if next == nil {
+			// at end of this cursor, remove it
+			heap.Pop(i)
+		} else {
+			// modified heap, fix it
+			i.cursors[0].curr = *next
+			heap.Fix(i, 0)
+		}
+	}
+
+	return &i.entry, nil
+}
+
+func (i *IndexSnapshotThesaurusKeys) Close() error {
+	return nil
+}
+
+func (i *IndexSnapshotThesaurusKeys) Contains(key []byte) (bool, error) {
+	if len(i.cursors) == 0 {
+		return false, nil
+	}
+
+	for _, cursor := range i.cursors {
+		if found, _ := cursor.thes.Contains(key); found {
+			return true, nil
+		}
+	}
+
+	return false, nil
+}
diff --git a/index_alias_impl.go b/index_alias_impl.go
index 3c7cdcd32..eee8243fb 100644
--- a/index_alias_impl.go
+++ b/index_alias_impl.go
@@ -31,6 +31,10 @@ type indexAliasImpl struct {
 	indexes []Index
 	mutex   sync.RWMutex
 	open    bool
+	// if all the indexes in the alias have the same mapping
+	// then the user can set the mapping here to avoid
+	// checking the mapping of each index in the alias
+	mapping mapping.IndexMapping
 }
 
 // NewIndexAlias creates a new IndexAlias over the provided
@@ -78,6 +82,25 @@ func (i *indexAliasImpl) Index(id string, data interface{}) error {
 	return i.indexes[0].Index(id, data)
 }
 
+func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+
+	err := i.isAliasToSingleIndex()
+	if err != nil {
+		return err
+	}
+
+	if si, ok := i.indexes[0].(SynonymIndex); ok {
+		return si.IndexSynonym(id, collection, definition)
+	}
+	return ErrorSynonymSearchNotSupported
+}
+
 func (i *indexAliasImpl) Delete(id string) error {
 	i.mutex.RLock()
 	defer i.mutex.RUnlock()
@@ -168,7 +191,11 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 		// indicates that this index alias is set as an Index
 		// in another alias, so we need to do a preSearch search
 		// and NOT a real search
-		return preSearchDataSearch(ctx, req, i.indexes...)
+		flags := &preSearchFlags{
+			knn:      requestHasKNN(req),
+			synonyms: !isMatchNoneQuery(req.Query),
+		}
+		return preSearchDataSearch(ctx, req, flags, i.indexes...)
 	}
 
 	// at this point we know we are doing a real search
@@ -182,12 +209,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 	// if necessary
 	var preSearchData map[string]map[string]interface{}
 	if req.PreSearchData != nil {
-		if requestHasKNN(req) {
-			var err error
-			preSearchData, err = redistributeKNNPreSearchData(req, i.indexes)
-			if err != nil {
-				return nil, err
-			}
+		var err error
+		preSearchData, err = redistributePreSearchData(req, i.indexes)
+		if err != nil {
+			return nil, err
 		}
 	}
 
@@ -208,9 +233,13 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 	// - the request requires preSearch
 	var preSearchDuration time.Duration
 	var sr *SearchResult
-	if req.PreSearchData == nil && preSearchRequired(req) {
+	flags, err := preSearchRequired(req, i.mapping)
+	if err != nil {
+		return nil, err
+	}
+	if req.PreSearchData == nil && flags != nil {
 		searchStart := time.Now()
-		preSearchResult, err := preSearch(ctx, req, i.indexes...)
+		preSearchResult, err := preSearch(ctx, req, flags, i.indexes...)
 		if err != nil {
 			return nil, err
 		}
@@ -221,17 +250,17 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 			return preSearchResult, nil
 		}
 		// finalize the preSearch result now
-		finalizePreSearchResult(req, preSearchResult)
+		finalizePreSearchResult(req, flags, preSearchResult)
 
 		// if there are no errors, then merge the data in the preSearch result
 		// and construct the preSearchData to be used in the actual search
 		// if the request is satisfied by the preSearch result, then we can
 		// directly return the preSearch result as the final result
-		if requestSatisfiedByPreSearch(req) {
+		if requestSatisfiedByPreSearch(req, flags) {
 			sr = finalizeSearchResult(req, preSearchResult)
 			// no need to run the 2nd phase MultiSearch(..)
 		} else {
-			preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes)
+			preSearchData, err = constructPreSearchData(req, flags, preSearchResult, i.indexes)
 			if err != nil {
 				return nil, err
 			}
@@ -352,6 +381,20 @@ func (i *indexAliasImpl) Close() error {
 	return nil
 }
 
+// SetIndexMapping sets the mapping for the alias and must be used
+// ONLY when all the indexes in the alias have the same mapping.
+// This is to avoid checking the mapping of each index in the alias
+// when executing a search request.
+func (i *indexAliasImpl) SetIndexMapping(m mapping.IndexMapping) error {
+	i.mutex.Lock()
+	defer i.mutex.Unlock()
+	if !i.open {
+		return ErrorIndexClosed
+	}
+	i.mapping = m
+	return nil
+}
+
 func (i *indexAliasImpl) Mapping() mapping.IndexMapping {
 	i.mutex.RLock()
 	defer i.mutex.RUnlock()
@@ -360,6 +403,11 @@ func (i *indexAliasImpl) Mapping() mapping.IndexMapping {
 		return nil
 	}
 
+	// if the mapping is already set, return it
+	if i.mapping != nil {
+		return i.mapping
+	}
+
 	err := i.isAliasToSingleIndex()
 	if err != nil {
 		return nil
@@ -520,21 +568,59 @@ type asyncSearchResult struct {
 	Err    error
 }
 
-func preSearchRequired(req *SearchRequest) bool {
-	return requestHasKNN(req)
+// preSearchFlags is a struct to hold flags indicating why preSearch is required
+type preSearchFlags struct {
+	knn      bool
+	synonyms bool
+}
+
+// preSearchRequired checks whether a preSearch phase is required and returns
+// the applicable preSearchFlags, or nil if no preSearch is needed.
+// It only allocates the preSearchFlags struct when necessary.
+func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) (*preSearchFlags, error) {
+	// Check for KNN query
+	knn := requestHasKNN(req)
+	var synonyms bool
+	if !isMatchNoneQuery(req.Query) {
+		// Check if synonyms are defined in the mapping
+		if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 {
+			// check if any of the fields queried have a synonym source
+			// in the index mapping, to prevent unnecessary preSearch
+			fs, err := query.ExtractFields(req.Query, m, nil)
+			if err != nil {
+				return nil, err
+			}
+			for field := range fs {
+				if sm.SynonymSourceForPath(field) != "" {
+					synonyms = true
+					break
+				}
+			}
+		}
+	}
+	if knn || synonyms {
+		return &preSearchFlags{
+			knn:      knn,
+			synonyms: synonyms,
+		}, nil
+	}
+	return nil, nil
 }
 
-func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
-	// create a dummy request with a match none query
-	// since we only care about the preSearchData in PreSearch
+func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
+	var dummyQuery = req.Query
+	if !flags.synonyms {
+		// create a dummy request with a match none query
+		// since we only care about the preSearchData in PreSearch
+		dummyQuery = query.NewMatchNoneQuery()
+	}
 	dummyRequest := &SearchRequest{
-		Query: query.NewMatchNoneQuery(),
+		Query: dummyQuery,
 	}
 	newCtx := context.WithValue(ctx, search.PreSearchKey, true)
-	if requestHasKNN(req) {
+	if flags.knn {
 		addKnnToDummyRequest(dummyRequest, req)
 	}
-	return preSearchDataSearch(newCtx, dummyRequest, indexes...)
+	return preSearchDataSearch(newCtx, dummyRequest, flags, indexes...)
 }
 
 // if the request is satisfied by just the preSearch result,
@@ -585,29 +671,77 @@ func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *Se
 	return preSearchResult
 }
 
-func requestSatisfiedByPreSearch(req *SearchRequest) bool {
-	if requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req) {
+func requestSatisfiedByPreSearch(req *SearchRequest, flags *preSearchFlags) bool {
+	// if the synonyms presearch flag is set the request can never be satisfied by
+	// the preSearch result as synonyms are not part of the preSearch result
+	if flags.synonyms {
+		return false
+	}
+	if flags.knn && isKNNrequestSatisfiedByPreSearch(req) {
 		return true
 	}
 	return false
 }
 
-func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
+func constructSynonymPreSearchData(rv map[string]map[string]interface{}, sr *SearchResult, indexes []Index) map[string]map[string]interface{} {
+	for _, index := range indexes {
+		rv[index.Name()][search.SynonymPreSearchDataKey] = sr.SynonymResult
+	}
+	return rv
+}
+
+func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
+	preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
 	mergedOut := make(map[string]map[string]interface{}, len(indexes))
 	for _, index := range indexes {
 		mergedOut[index.Name()] = make(map[string]interface{})
 	}
 	var err error
-	if requestHasKNN(req) {
+	if flags.knn {
 		mergedOut, err = constructKnnPreSearchData(mergedOut, preSearchResult, indexes)
 		if err != nil {
 			return nil, err
 		}
 	}
+	if flags.synonyms {
+		mergedOut = constructSynonymPreSearchData(mergedOut, preSearchResult, indexes)
+	}
 	return mergedOut, nil
 }
 
-func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
+// redistributePreSearchData redistributes the preSearchData sent in the search request to an index alias
+// which would happen in the case of an alias tree and depending on the level of the tree, the preSearchData
+// needs to be redistributed to the indexes at that level
+func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+	rv := make(map[string]map[string]interface{})
+	for _, index := range indexes {
+		rv[index.Name()] = make(map[string]interface{})
+	}
+	if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok {
+		// the preSearchData for KNN is a list of DocumentMatch objects
+		// that need to be redistributed to the right index.
+		// This is used only in the case of an alias tree, where the indexes
+		// are at the leaves of the tree, and the master alias is at the root.
+		// At each level of the tree, the preSearchData needs to be redistributed
+		// to the indexes/aliases at that level, because the preSearchData is
+		// specific to each final index at the leaf.
+		segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
+		if err != nil {
+			return nil, err
+		}
+		for _, index := range indexes {
+			rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()]
+		}
+	}
+	if fts, ok := req.PreSearchData[search.SynonymPreSearchDataKey].(search.FieldTermSynonymMap); ok {
+		for _, index := range indexes {
+			rv[index.Name()][search.SynonymPreSearchDataKey] = fts
+		}
+	}
+	return rv, nil
+}
+
+func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
 	asyncResults := make(chan *asyncSearchResult, len(indexes))
 	// run search on each index in separate go routine
 	var waitGroup sync.WaitGroup
@@ -638,7 +772,7 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind
 			if prp == nil {
 				// first valid preSearch result
 				// create a new preSearch result processor
-				prp = createPreSearchResultProcessor(req)
+				prp = createPreSearchResultProcessor(req, flags)
 			}
 			prp.add(asr.Result, asr.Name)
 			if sr == nil {
@@ -684,6 +818,14 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind
 	return sr, nil
 }
 
+// finalizePreSearchResult finalizes the preSearch result by applying the finalization steps
+// specific to the preSearch flags
+func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) {
+	if flags.knn {
+		preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
+	}
+}
+
 // hitsInCurrentPage returns the hits in the current page
 // using the From and Size parameters in the request
 func hitsInCurrentPage(req *SearchRequest, hits []*search.DocumentMatch) []*search.DocumentMatch {
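None of this plumbing is visible to callers: when an alias (or alias tree) is searched, the preSearch phase and the redistribution above run automatically. A hedged sketch, with illustrative index paths and query, and assuming the `SetIndexMapping` helper added above is reachable on the alias handle:

```go
// Sketch: synonym preSearch runs transparently for alias searches
// (index paths and the query term are illustrative).
idxA, _ := bleve.Open("shard-a.bleve")
idxB, _ := bleve.Open("shard-b.bleve")
alias := bleve.NewIndexAlias(idxA, idxB)
// optional: if every member shares one mapping, declaring it up front
// lets preSearchRequired consult it without touching each index
_ = alias.SetIndexMapping(idxA.Mapping())
res, err := alias.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("car")))
```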
diff --git a/index_impl.go b/index_impl.go
index e6debf17a..289014f6c 100644
--- a/index_impl.go
+++ b/index_impl.go
@@ -38,6 +38,7 @@ import (
 	"github.com/blevesearch/bleve/v2/search/collector"
 	"github.com/blevesearch/bleve/v2/search/facet"
 	"github.com/blevesearch/bleve/v2/search/highlight"
+	"github.com/blevesearch/bleve/v2/search/query"
 	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
 	"github.com/blevesearch/geo/s2"
@@ -267,6 +268,40 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) {
 	return
 }
 
+// IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection.
+// A synonym definition defines term relationships for query expansion in searches.
+func (i *indexImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	if id == "" {
+		return ErrorEmptyID
+	}
+
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+
+	i.FireIndexEvent()
+
+	synMap, ok := i.m.(mapping.SynonymMapping)
+	if !ok {
+		return ErrorSynonymSearchNotSupported
+	}
+
+	if err := definition.Validate(); err != nil {
+		return err
+	}
+
+	doc := document.NewSynonymDocument(id)
+	err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms)
+	if err != nil {
+		return err
+	}
+	err = i.i.Update(doc)
+	return err
+}
+
 // IndexAdvanced takes a document.Document object
 // skips the mapping and indexes it.
 func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) {
@@ -449,12 +484,25 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in
 		}
 	}
 
+	var fts search.FieldTermSynonymMap
+	if !isMatchNoneQuery(req.Query) {
+		if synMap, ok := i.m.(mapping.SynonymMapping); ok {
+			if synReader, ok := reader.(index.ThesaurusReader); ok {
+				fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+	}
+
 	return &SearchResult{
 		Status: &SearchStatus{
 			Total:      1,
 			Successful: 1,
 		},
-		Hits: knnHits,
+		Hits:          knnHits,
+		SynonymResult: fts,
 	}, nil
 }
 
@@ -505,8 +553,12 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
 	}
 
 	var knnHits []*search.DocumentMatch
+	var skipKNNCollector bool
+
+	var fts search.FieldTermSynonymMap
+	var skipSynonymCollector bool
+
 	var ok bool
-	var skipKnnCollector bool
 	if req.PreSearchData != nil {
 		for k, v := range req.PreSearchData {
 			switch k {
@@ -516,20 +568,43 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
 				if !ok {
 					return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch")
 				}
+				skipKNNCollector = true
+			}
+		case search.SynonymPreSearchDataKey:
+			if v != nil {
+				fts, ok = v.(search.FieldTermSynonymMap)
+				if !ok {
+					return nil, fmt.Errorf("synonym preSearchData must be of type search.FieldTermSynonymMap")
+				}
+				skipSynonymCollector = true
 			}
-			skipKnnCollector = true
 		}
 	}
 }
 
-	if !skipKnnCollector && requestHasKNN(req) {
+	if !skipKNNCollector && requestHasKNN(req) {
 		knnHits, err = i.runKnnCollector(ctx, req, indexReader, false)
 		if err != nil {
 			return nil, err
 		}
 	}
 
+	if !skipSynonymCollector {
+		if synMap, ok := i.m.(mapping.SynonymMapping); ok && synMap.SynonymCount() > 0 {
+			if synReader, ok := indexReader.(index.ThesaurusReader); ok {
+				fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+	}
+
 	setKnnHitsInCollector(knnHits, req, coll)
 
+	if fts != nil {
+		ctx = context.WithValue(ctx, search.FieldTermSynonymMapKey, fts)
+	}
+
 	// This callback and variable handles the tracking of bytes read
 	// 1. as part of creation of tfr and its Next() calls which is
 	// accounted by invoking this callback when the TFR is closed.
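On the leaf index, the merged expansions travel to the term searchers via the context. A hedged sketch of retrieving them; the concrete layout of `search.FieldTermSynonymMap` is not shown in this patch, so only the type assertion is illustrated:

```go
// Sketch: reading the synonym expansions that SearchInContext stashed in
// the context (the internal layout of FieldTermSynonymMap is an assumption
// here, so only the lookup is shown).
if v := ctx.Value(search.FieldTermSynonymMapKey); v != nil {
	if fts, ok := v.(search.FieldTermSynonymMap); ok {
		_ = fts // term searchers consult this to expand query terms
	}
}
```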
diff --git a/index_test.go b/index_test.go
index e75c5fd87..5d801b4e0 100644
--- a/index_test.go
+++ b/index_test.go
@@ -402,9 +402,9 @@ func TestBytesRead(t *testing.T) {
 	stats, _ := idx.StatsMap()["index"].(map[string]interface{})
 	prevBytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64)
-	expectedBytesRead := uint64(21639)
+	expectedBytesRead := uint64(22049)
 	if supportForVectorSearch {
-		expectedBytesRead = 22049
+		expectedBytesRead = 22459
 	}
 
 	if prevBytesRead != expectedBytesRead && res.Cost == prevBytesRead {
@@ -560,9 +560,9 @@ func TestBytesReadStored(t *testing.T) {
 	stats, _ := idx.StatsMap()["index"].(map[string]interface{})
 	bytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64)
-	expectedBytesRead := uint64(11501)
+	expectedBytesRead := uint64(11911)
 	if supportForVectorSearch {
-		expectedBytesRead = 11911
+		expectedBytesRead = 12321
 	}
 
 	if bytesRead != expectedBytesRead && bytesRead == res.Cost {
@@ -637,9 +637,9 @@ func TestBytesReadStored(t *testing.T) {
 	stats, _ = idx1.StatsMap()["index"].(map[string]interface{})
 	bytesRead, _ = stats["num_bytes_read_at_query_time"].(uint64)
-	expectedBytesRead = uint64(3687)
+	expectedBytesRead = uint64(4097)
 	if supportForVectorSearch {
-		expectedBytesRead = 4097
+		expectedBytesRead = 4507
 	}
 
 	if bytesRead != expectedBytesRead && bytesRead == res.Cost {
diff --git a/mapping/analysis.go b/mapping/analysis.go
index 03e3cd01b..311e97232 100644
--- a/mapping/analysis.go
+++ b/mapping/analysis.go
@@ -21,6 +21,7 @@ type customAnalysis struct {
 	TokenFilters    map[string]map[string]interface{} `json:"token_filters,omitempty"`
 	Analyzers       map[string]map[string]interface{} `json:"analyzers,omitempty"`
 	DateTimeParsers map[string]map[string]interface{} `json:"date_time_parsers,omitempty"`
+	SynonymSources  map[string]map[string]interface{} `json:"synonym_sources,omitempty"`
 }
 
 func (c *customAnalysis) registerAll(i *IndexMappingImpl) error {
@@ -83,6 +84,12 @@ func (c *customAnalysis) registerAll(i *IndexMappingImpl) error {
 			return err
 		}
 	}
+	for name, config := range c.SynonymSources {
+		_, err := i.cache.DefineSynonymSource(name, config)
+		if err != nil {
+			return err
+		}
+	}
 	return nil
 }
 
@@ -94,6 +101,7 @@ func newCustomAnalysis() *customAnalysis {
 		TokenFilters:    make(map[string]map[string]interface{}),
 		Analyzers:       make(map[string]map[string]interface{}),
 		DateTimeParsers: make(map[string]map[string]interface{}),
+		SynonymSources:  make(map[string]map[string]interface{}),
 	}
 	return &rv
 }
diff --git a/mapping/document.go b/mapping/document.go
index 847326e41..e89e66979 100644
--- a/mapping/document.go
+++ b/mapping/document.go
@@ -40,11 +40,12 @@ import (
 // are used. To disable this automatic handling, set
 // Dynamic to false.
 type DocumentMapping struct {
-	Enabled         bool                        `json:"enabled"`
-	Dynamic         bool                        `json:"dynamic"`
-	Properties      map[string]*DocumentMapping `json:"properties,omitempty"`
-	Fields          []*FieldMapping             `json:"fields,omitempty"`
-	DefaultAnalyzer string                      `json:"default_analyzer,omitempty"`
+	Enabled              bool                        `json:"enabled"`
+	Dynamic              bool                        `json:"dynamic"`
+	Properties           map[string]*DocumentMapping `json:"properties,omitempty"`
+	Fields               []*FieldMapping             `json:"fields,omitempty"`
+	DefaultAnalyzer      string                      `json:"default_analyzer,omitempty"`
+	DefaultSynonymSource string                      `json:"default_synonym_source,omitempty"`
 
 	// StructTagKey overrides "json" when looking for field names in struct tags
 	StructTagKey string `json:"struct_tag_key,omitempty"`
@@ -59,6 +60,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
 			return err
 		}
 	}
+	if dm.DefaultSynonymSource != "" {
+		_, err := cache.SynonymSourceNamed(dm.DefaultSynonymSource)
+		if err != nil {
+			return err
+		}
+	}
 	for propertyName, property := range dm.Properties {
 		newParent := propertyName
 		if parentName != "" {
@@ -82,7 +89,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
 				return err
 			}
 		}
-
+		if field.SynonymSource != "" {
+			_, err = cache.SynonymSourceNamed(field.SynonymSource)
+			if err != nil {
+				return err
+			}
+		}
 		err := validateFieldMapping(field, parentName, fieldAliasCtx)
 		if err != nil {
 			return err
@@ -112,6 +124,17 @@ func (dm *DocumentMapping) analyzerNameForPath(path string) string {
 	return ""
 }
 
+// synonymSourceForPath attempts to first find the field
+// described by this path, then returns the synonym source
+// configured for that field
+func (dm *DocumentMapping) synonymSourceForPath(path string) string {
+	field := dm.fieldDescribedByPath(path)
+	if field != nil {
+		return field.SynonymSource
+	}
+	return ""
+}
+
 func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping {
 	pathElements := decodePath(path)
 	if len(pathElements) > 1 {
@@ -295,6 +318,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return err
 			}
+		case "default_synonym_source":
+			err := util.UnmarshalJSON(v, &dm.DefaultSynonymSource)
+			if err != nil {
+				return err
+			}
 		case "properties":
 			err := util.UnmarshalJSON(v, &dm.Properties)
 			if err != nil {
@@ -338,6 +366,22 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
 	return rv
 }
 
+func (dm *DocumentMapping) defaultSynonymSource(path []string) string {
+	current := dm
+	rv := current.DefaultSynonymSource
+	for _, pathElement := range path {
+		var ok bool
+		current, ok = current.Properties[pathElement]
+		if !ok {
+			break
+		}
+		if current.DefaultSynonymSource != "" {
+			rv = current.DefaultSynonymSource
+		}
+	}
+	return rv
+}
+
 func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
 	// allow default "json" tag to be overridden
 	structTagKey := dm.StructTagKey
diff --git a/mapping/field.go b/mapping/field.go
index 5c064fddd..ce2878b18 100644
--- a/mapping/field.go
+++ b/mapping/field.go
@@ -80,6 +80,8 @@ type FieldMapping struct {
 
 	// Applicable to vector fields only - optimization string
 	VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"`
+
+	SynonymSource string `json:"synonym_source,omitempty"`
 }
 
 // NewTextFieldMapping returns a default field mapping for text
@@ -460,17 +462,22 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error {
 				return err
 			}
 		case "dims":
-			err := json.Unmarshal(v, &fm.Dims)
+			err := util.UnmarshalJSON(v, &fm.Dims)
 			if err != nil {
 				return err
 			}
"similarity": - err := json.Unmarshal(v, &fm.Similarity) + err := util.UnmarshalJSON(v, &fm.Similarity) if err != nil { return err } case "vector_index_optimized_for": - err := json.Unmarshal(v, &fm.VectorIndexOptimizedFor) + err := util.UnmarshalJSON(v, &fm.VectorIndexOptimizedFor) + if err != nil { + return err + } + case "synonym_source": + err := util.UnmarshalJSON(v, &fm.SynonymSource) if err != nil { return err } diff --git a/mapping/index.go b/mapping/index.go index fe8c96713..8a0d5e34a 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -49,6 +49,7 @@ type IndexMappingImpl struct { DefaultType string `json:"default_type"` DefaultAnalyzer string `json:"default_analyzer"` DefaultDateTimeParser string `json:"default_datetime_parser"` + DefaultSynonymSource string `json:"default_synonym_source,omitempty"` DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` @@ -145,6 +146,15 @@ func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[stri return nil } +func (im *IndexMappingImpl) AddSynonymSource(name string, config map[string]interface{}) error { + _, err := im.cache.DefineSynonymSource(name, config) + if err != nil { + return err + } + im.CustomAnalysis.SynonymSources[name] = config + return nil +} + // NewIndexMapping creates a new IndexMapping that will use all the default indexing rules func NewIndexMapping() *IndexMappingImpl { return &IndexMappingImpl{ @@ -174,7 +184,12 @@ func (im *IndexMappingImpl) Validate() error { if err != nil { return err } - + if im.DefaultSynonymSource != "" { + _, err = im.cache.SynonymSourceNamed(im.DefaultSynonymSource) + if err != nil { + return err + } + } fieldAliasCtx := make(map[string]*FieldMapping) err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { @@ -253,6 +268,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "default_synonym_source": + err := util.UnmarshalJSON(v, &im.DefaultSynonymSource) + if err != nil { + return err + } case "default_field": err := util.UnmarshalJSON(v, &im.DefaultField) if err != nil { @@ -339,6 +359,24 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} return nil } +func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error { + // determine all the synonym sources with the given collection + // and create a synonym field for each + err := im.SynonymSourceVisitor(func(name string, item analysis.SynonymSource) error { + if item.Collection() == collection { + // create a new field with the name of the synonym source + analyzer := im.AnalyzerNamed(item.Analyzer()) + if analyzer == nil { + return fmt.Errorf("unknown analyzer named: %s", item.Analyzer()) + } + field := document.NewSynonymField(name, analyzer, input, synonyms) + doc.AddField(field) + } + return nil + }) + return err +} + type walkContext struct { doc *document.Document im *IndexMappingImpl @@ -457,3 +495,66 @@ func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping { func (im *IndexMappingImpl) DefaultSearchField() string { return im.DefaultField } + +func (im *IndexMappingImpl) SynonymSourceNamed(name string) analysis.SynonymSource { + syn, err := im.cache.SynonymSourceNamed(name) + if err != nil { + logger.Printf("error using synonym source named: %s", name) + return nil + } + return syn +} + +func (im *IndexMappingImpl) SynonymSourceForPath(path string) string 
+	// first we look for explicit mapping on the field
+	for _, docMapping := range im.TypeMapping {
+		synonymSource := docMapping.synonymSourceForPath(path)
+		if synonymSource != "" {
+			return synonymSource
+		}
+	}
+
+	// now try the default mapping
+	pathMapping, _ := im.DefaultMapping.documentMappingForPath(path)
+	if pathMapping != nil {
+		if len(pathMapping.Fields) > 0 {
+			if pathMapping.Fields[0].SynonymSource != "" {
+				return pathMapping.Fields[0].SynonymSource
+			}
+		}
+	}
+
+	// next we will try default synonym sources for the path
+	pathDecoded := decodePath(path)
+	for _, docMapping := range im.TypeMapping {
+		if docMapping.Enabled {
+			rv := docMapping.defaultSynonymSource(pathDecoded)
+			if rv != "" {
+				return rv
+			}
+		}
+	}
+	// now the default synonym source for the default mapping
+	if im.DefaultMapping.Enabled {
+		rv := im.DefaultMapping.defaultSynonymSource(pathDecoded)
+		if rv != "" {
+			return rv
+		}
+	}
+
+	return im.DefaultSynonymSource
+}
+
+// SynonymCount returns the number of synonym sources defined in the mapping
+func (im *IndexMappingImpl) SynonymCount() int {
+	return len(im.CustomAnalysis.SynonymSources)
+}
+
+// SynonymSourceVisitor allows a visitor to iterate over all synonym sources
+func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error {
+	err := im.cache.SynonymSources.VisitSynonymSources(visitor)
+	if err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/mapping/mapping.go b/mapping/mapping.go
index cbfc98faa..a6c1591b8 100644
--- a/mapping/mapping.go
+++ b/mapping/mapping.go
@@ -58,3 +58,19 @@ type IndexMapping interface {
 
 	FieldMappingForPath(path string) FieldMapping
 }
+
+// A SynonymMapping extends the IndexMapping interface to provide
+// additional methods for working with synonyms.
+type SynonymMapping interface {
+	IndexMapping
+
+	MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error
+
+	SynonymSourceForPath(path string) string
+
+	SynonymSourceNamed(name string) analysis.SynonymSource
+
+	SynonymCount() int
+
+	SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error
+}
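Putting the mapping pieces together: an index opts into synonyms by defining a source and pointing fields at it. A hedged sketch (the source, collection, analyzer, and field names are illustrative; the config keys match `SynonymSourceConstructor` below):

```go
// Sketch: wiring a synonym source into an index mapping.
im := mapping.NewIndexMapping()
err := im.AddSynonymSource("english-syn", map[string]interface{}{
	"collection": "english",  // synonym definitions are indexed under this collection
	"analyzer":   "standard", // used to analyze the synonym terms
})
if err != nil {
	log.Fatal(err)
}
fm := mapping.NewTextFieldMapping()
fm.SynonymSource = "english-syn"
im.DefaultMapping.AddFieldMappingsAt("description", fm)
```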
+
+package mapping
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+type SynonymSource struct {
+	CollectionName string `json:"collection"`
+	AnalyzerName   string `json:"analyzer"`
+}
+
+func NewSynonymSource(collection, analyzer string) *SynonymSource {
+	return &SynonymSource{
+		CollectionName: collection,
+		AnalyzerName:   analyzer,
+	}
+}
+
+func (s *SynonymSource) Collection() string {
+	return s.CollectionName
+}
+
+func (s *SynonymSource) Analyzer() string {
+	return s.AnalyzerName
+}
+
+func (s *SynonymSource) SetCollection(c string) {
+	s.CollectionName = c
+}
+
+func (s *SynonymSource) SetAnalyzer(a string) {
+	s.AnalyzerName = a
+}
+
+func SynonymSourceConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.SynonymSource, error) {
+	collection, ok := config["collection"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify collection")
+	}
+	analyzer, ok := config["analyzer"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify analyzer")
+	}
+	if _, err := cache.AnalyzerNamed(analyzer); err != nil {
+		return nil, fmt.Errorf("analyzer named '%s' not found", analyzer)
+	}
+	return NewSynonymSource(collection, analyzer), nil
+}
+
+func init() {
+	registry.RegisterSynonymSource(analysis.SynonymSourceType, SynonymSourceConstructor)
+}
diff --git a/pre_search.go b/pre_search.go
index c8c55bfbc..90d1293bf 100644
--- a/pre_search.go
+++ b/pre_search.go
@@ -14,6 +14,10 @@
 
 package bleve
 
+import (
+	"github.com/blevesearch/bleve/v2/search"
+)
+
 // A preSearchResultProcessor processes the data in
 // the preSearch result from multiple
 // indexes in an alias and merges them together to
@@ -26,6 +30,8 @@ type preSearchResultProcessor interface {
 	finalize(*SearchResult)
 }
 
+// -----------------------------------------------------------------------------
+// KNN preSearchResultProcessor for handling KNN presearch results
 type knnPreSearchResultProcessor struct {
 	addFn      func(sr *SearchResult, indexName string)
 	finalizeFn func(sr *SearchResult)
@@ -44,16 +50,84 @@ func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) {
 }
 
 // -----------------------------------------------------------------------------
+// Synonym preSearchResultProcessor for handling synonym presearch results
+type synonymPreSearchResultProcessor struct {
+	finalizedFts search.FieldTermSynonymMap
+}
+
+func newSynonymPreSearchResultProcessor() *synonymPreSearchResultProcessor {
+	return &synonymPreSearchResultProcessor{}
+}
+
+func (s *synonymPreSearchResultProcessor) add(sr *SearchResult, indexName string) {
+	// Nothing to do if this result carries no synonym data
+	if sr.SynonymResult == nil {
+		return
+	}
+
+	// Merge with finalizedFts, or initialize it if nil
+	if s.finalizedFts == nil {
+		s.finalizedFts = sr.SynonymResult
+	} else {
+		s.finalizedFts.MergeWith(sr.SynonymResult)
+	}
+}
+
+func (s *synonymPreSearchResultProcessor) finalize(sr *SearchResult) {
+	// Set the finalized synonym data on the search result
+	if s.finalizedFts != nil {
+		sr.SynonymResult = s.finalizedFts
+	}
+}
 
-func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) {
-	if requestHasKNN(req) {
-		preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
-	}
-}
-
-func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor {
-	if requestHasKNN(req) {
-		return newKnnPreSearchResultProcessor(req)
----------------------------------------------------------------------------- +// Master struct that can hold any number of presearch result processors +type compositePreSearchResultProcessor struct { + presearchResultProcessors []preSearchResultProcessor +} + +// Implements the add method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) add(sr *SearchResult, indexName string) { + for _, p := range m.presearchResultProcessors { + p.add(sr, indexName) + } +} + +// Implements the finalize method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) { + for _, p := range m.presearchResultProcessors { + p.finalize(sr) + } +} + +// ----------------------------------------------------------------------------- +// Function to create the appropriate preSearchResultProcessor(s) +func createPreSearchResultProcessor(req *SearchRequest, flags *preSearchFlags) preSearchResultProcessor { + var processors []preSearchResultProcessor + // Add KNN processor if the request has KNN + if flags.knn { + if knnProcessor := newKnnPreSearchResultProcessor(req); knnProcessor != nil { + processors = append(processors, knnProcessor) + } + } + // Add Synonym processor if the request has Synonym + if flags.synonyms { + if synonymProcessor := newSynonymPreSearchResultProcessor(); synonymProcessor != nil { + processors = append(processors, synonymProcessor) + } + } + // Return based on the number of processors, optimizing for the common case of 1 processor + // If there are no processors, return nil + switch len(processors) { + case 0: + return nil + case 1: + return processors[0] + default: + return &compositePreSearchResultProcessor{ + presearchResultProcessors: processors, + } } - return &knnPreSearchResultProcessor{} // equivalent to nil } diff --git a/registry/registry.go b/registry/registry.go index 1954d0896..69ee8dd86 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -36,6 +36,7 @@ var tokenMaps = make(TokenMapRegistry, 0) var tokenFilters = make(TokenFilterRegistry, 0) var analyzers = make(AnalyzerRegistry, 0) var dateTimeParsers = make(DateTimeParserRegistry, 0) +var synonymSources = make(SynonymSourceRegistry, 0) type Cache struct { CharFilters *CharFilterCache @@ -47,6 +48,7 @@ type Cache struct { FragmentFormatters *FragmentFormatterCache Fragmenters *FragmenterCache Highlighters *HighlighterCache + SynonymSources *SynonymSourceCache } func NewCache() *Cache { @@ -60,6 +62,7 @@ func NewCache() *Cache { FragmentFormatters: NewFragmentFormatterCache(), Fragmenters: NewFragmenterCache(), Highlighters: NewHighlighterCache(), + SynonymSources: NewSynonymSourceCache(), } } @@ -147,6 +150,14 @@ func (c *Cache) DefineDateTimeParser(name string, config map[string]interface{}) return c.DateTimeParsers.DefineDateTimeParser(name, typ, config, c) } +func (c *Cache) SynonymSourceNamed(name string) (analysis.SynonymSource, error) { + return c.SynonymSources.SynonymSourceNamed(name, c) +} + +func (c *Cache) DefineSynonymSource(name string, config map[string]interface{}) (analysis.SynonymSource, error) { + return c.SynonymSources.DefineSynonymSource(name, analysis.SynonymSourceType, config, c) +} + func (c *Cache) FragmentFormatterNamed(name string) (highlight.FragmentFormatter, error) { return c.FragmentFormatters.FragmentFormatterNamed(name, c) } diff --git a/registry/synonym_source.go b/registry/synonym_source.go new file mode 100644 index 000000000..cd26d8f01 --- /dev/null +++ 
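With the registry wiring in place, a synonym source can be defined once on a Cache and retrieved by name thereafter. A hedged sketch (the analyzer named in the config must already be registered before DefineSynonymSource is called; here bleve's simple analyzer is assumed to be imported):

    cache := registry.NewCache()
    ss, err := cache.DefineSynonymSource("english", map[string]interface{}{
        "collection": "collection1",
        "analyzer":   simple.Name, // assumes the simple analyzer package is imported
    })
    if err != nil {
        panic(err)
    }
    fmt.Println(ss.Collection(), ss.Analyzer()) // "collection1 simple"
    // subsequent lookups return the cached instance
    ss, _ = cache.SynonymSourceNamed("english")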
diff --git a/registry/synonym_source.go b/registry/synonym_source.go
new file mode 100644
index 000000000..cd26d8f01
--- /dev/null
+++ b/registry/synonym_source.go
@@ -0,0 +1,85 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package registry
+
+import (
+    "fmt"
+
+    "github.com/blevesearch/bleve/v2/analysis"
+)
+
+func RegisterSynonymSource(typ string, constructor SynonymSourceConstructor) {
+    _, exists := synonymSources[typ]
+    if exists {
+        panic(fmt.Errorf("attempted to register duplicate synonym source with type '%s'", typ))
+    }
+    synonymSources[typ] = constructor
+}
+
+type SynonymSourceCache struct {
+    *ConcurrentCache
+}
+
+func NewSynonymSourceCache() *SynonymSourceCache {
+    return &SynonymSourceCache{
+        NewConcurrentCache(),
+    }
+}
+
+type SynonymSourceConstructor func(config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error)
+type SynonymSourceRegistry map[string]SynonymSourceConstructor
+
+func SynonymSourceBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) {
+    cons, registered := synonymSources[name]
+    if !registered {
+        return nil, fmt.Errorf("no synonym source with name '%s' registered", name)
+    }
+    synonymSource, err := cons(config, cache)
+    if err != nil {
+        return nil, fmt.Errorf("error building synonym source: %v", err)
+    }
+    return synonymSource, nil
+}
+
+func (c *SynonymSourceCache) SynonymSourceNamed(name string, cache *Cache) (analysis.SynonymSource, error) {
+    item, err := c.ItemNamed(name, cache, SynonymSourceBuild)
+    if err != nil {
+        return nil, err
+    }
+    return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) DefineSynonymSource(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error) {
+    item, err := c.DefineItem(name, typ, config, cache, SynonymSourceBuild)
+    if err != nil {
+        if err == ErrAlreadyDefined {
+            return nil, fmt.Errorf("synonym source named '%s' already defined", name)
+        }
+        return nil, err
+    }
+    return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) VisitSynonymSources(visitor analysis.SynonymSourceVisitor) error {
+    c.mutex.RLock()
+    defer c.mutex.RUnlock()
+    for k, v := range c.data {
+        err := visitor(k, v.(analysis.SynonymSource))
+        if err != nil {
+            return err
+        }
+    }
+    return nil
+}
diff --git a/search.go b/search.go
index 7861d24b8..72bfca5e2 100644
--- a/search.go
+++ b/search.go
@@ -444,6 +444,9 @@ type SearchResult struct {
 	MaxScore float64             `json:"max_score"`
 	Took     time.Duration       `json:"took"`
 	Facets   search.FacetResults `json:"facets"`
+	// special field that is applicable only for search
+	// results obtained from a presearch
+	SynonymResult search.FieldTermSynonymMap `json:"synonym_result,omitempty"`
 }
 
 func (sr *SearchResult) Size() int {
@@ -589,3 +592,8 @@ func (r *SearchRequest) SortFunc() func(data sort.Interface) {
 
 	return sort.Sort
 }
+
+func isMatchNoneQuery(q query.Query) bool {
+    _, ok := q.(*query.MatchNoneQuery)
+    return ok
+}
diff --git a/search/levenshtein.go b/search/levenshtein.go
index 687608d3f..dadab2521 100644
--- a/search/levenshtein.go
+++ b/search/levenshtein.go
@@ -68,6 +68,10 @@ func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) {
 	ld := int(math.Abs(float64(la - lb)))
 	if ld > max {
 		return max, true, d
+	} else if la == 0 || lb == 0 {
+		// if one of the two strings is empty, then ld is
+		// the length of the other string and as such is <= max
+		return ld, false, d
 	}
 
 	if cap(d) < la+1 {
diff --git a/search/levenshtein_test.go b/search/levenshtein_test.go
index 651f7803c..ef23980ef 100644
--- a/search/levenshtein_test.go
+++ b/search/levenshtein_test.go
@@ -69,12 +69,19 @@ func TestLevenshteinDistanceMax(t *testing.T) {
 			exceeded: true,
 		},
 		{
-			a:        "water",
+			a:        "",
 			b:        "water",
-			max:      1,
-			dist:     0,
+			max:      10,
+			dist:     5,
 			exceeded: false,
 		},
+		{
+			a:        "water",
+			b:        "",
+			max:      3,
+			dist:     3,
+			exceeded: true,
+		},
 	}
 
 	for _, test := range tests {
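The new early-exit branch matters because an empty input needs no table fill at all: the distance is simply the other string's length. The updated tests above pin down both sides of the max threshold; a small sketch of the same behavior:

    // ld("", "water") = 5 <= max 10: reported exactly, not exceeded
    dist, exceeded, d := search.LevenshteinDistanceMaxReuseSlice("", "water", 10, nil)
    fmt.Println(dist, exceeded) // 5 false
    // ld("water", "") = 5 > max 3: capped at max and flagged as exceeded
    dist, exceeded, d = search.LevenshteinDistanceMaxReuseSlice("water", "", 3, d)
    fmt.Println(dist, exceeded) // 3 true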
diff --git a/search/query/query.go b/search/query/query.go
index d263a0e54..86859ae5b 100644
--- a/search/query/query.go
+++ b/search/query/query.go
@@ -20,9 +20,12 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"strings"
 
+	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search"
+	"github.com/blevesearch/bleve/v2/search/searcher"
 	"github.com/blevesearch/bleve/v2/util"
 	index "github.com/blevesearch/bleve_index_api"
 )
@@ -90,6 +93,18 @@ func ParsePreSearchData(input []byte) (map[string]interface{}, error) {
 				rv = make(map[string]interface{})
 			}
 			rv[search.KnnPreSearchDataKey] = value
+		case search.SynonymPreSearchDataKey:
+			var value search.FieldTermSynonymMap
+			if v != nil {
+				err := util.UnmarshalJSON(v, &value)
+				if err != nil {
+					return nil, err
+				}
+			}
+			if rv == nil {
+				rv = make(map[string]interface{})
+			}
+			rv[search.SynonymPreSearchDataKey] = value
 		}
 	}
 	return rv, nil
@@ -423,3 +438,334 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) {
 	data, err := json.MarshalIndent(q, "", "  ")
 	return string(data), err
 }
+
+// FieldSet represents a set of queried fields.
+type FieldSet map[string]struct{}
+
+// ExtractFields returns a set of fields referenced by the query.
+// The returned set may be nil if the query does not explicitly reference any field
+// and the DefaultSearchField is unset in the index mapping.
+func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) {
+    if q == nil || m == nil {
+        return fs, nil
+    }
+    var err error
+    switch q := q.(type) {
+    case FieldableQuery:
+        f := q.Field()
+        if f == "" {
+            f = m.DefaultSearchField()
+        }
+        if f != "" {
+            if fs == nil {
+                fs = make(FieldSet)
+            }
+            fs[f] = struct{}{}
+        }
+    case *QueryStringQuery:
+        var expandedQuery Query
+        expandedQuery, err = expandQuery(m, q)
+        if err == nil {
+            fs, err = ExtractFields(expandedQuery, m, fs)
+        }
+    case *BooleanQuery:
+        for _, subq := range []Query{q.Must, q.Should, q.MustNot} {
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
+        }
+    case *ConjunctionQuery:
+        for _, subq := range q.Conjuncts {
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
+        }
+    case *DisjunctionQuery:
+        for _, subq := range q.Disjuncts {
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
+        }
+    }
+    return fs, err
+}
+
+const (
+    FuzzyMatchType = iota
+    RegexpMatchType
+    PrefixMatchType
+)
+
+// ExtractSynonyms extracts synonyms from the query tree and returns a map of
+// field-term pairs to their synonyms. The input query tree is traversed and,
+// for each term query, the synonyms are extracted from the synonym source
+// associated with the field. The synonyms are then added to the provided map.
+// The map is returned and may be nil if no synonyms were found.
+func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.ThesaurusReader,
+    query Query, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) {
+
+    if r == nil || m == nil || query == nil {
+        return rv, nil
+    }
+    var err error
+    resolveFieldAndSource := func(field string) (string, string) {
+        if field == "" {
+            field = m.DefaultSearchField()
+        }
+        return field, m.SynonymSourceForPath(field)
+    }
+    handleAnalyzer := func(analyzerName, field string) (analysis.Analyzer, error) {
+        if analyzerName == "" {
+            analyzerName = m.AnalyzerNameForPath(field)
+        }
+        analyzer := m.AnalyzerNamed(analyzerName)
+        if analyzer == nil {
+            return nil, fmt.Errorf("no analyzer named '%s' registered", analyzerName)
+        }
+        return analyzer, nil
+    }
+    switch q := query.(type) {
+    case *BooleanQuery:
+        rv, err = ExtractSynonyms(ctx, m, r, q.Must, rv)
+        if err != nil {
+            return nil, err
+        }
+        rv, err = ExtractSynonyms(ctx, m, r, q.Should, rv)
+        if err != nil {
+            return nil, err
+        }
+        rv, err = ExtractSynonyms(ctx, m, r, q.MustNot, rv)
+        if err != nil {
+            return nil, err
+        }
+    case *ConjunctionQuery:
+        for _, child := range q.Conjuncts {
+            rv, err = ExtractSynonyms(ctx, m, r, child, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *DisjunctionQuery:
+        for _, child := range q.Disjuncts {
+            rv, err = ExtractSynonyms(ctx, m, r, child, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *FuzzyQuery:
+        field, source := resolveFieldAndSource(q.FieldVal)
+        if source != "" {
+            fuzziness := q.Fuzziness
+            if q.autoFuzzy {
+                fuzziness = searcher.GetAutoFuzziness(q.Term)
+            }
+            rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, q.Term, fuzziness, q.Prefix, r, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *MatchQuery, *MatchPhraseQuery:
+        var analyzerName, matchString, fieldVal string
+        var fuzziness, prefix int
+        var autoFuzzy bool
+        if mq, ok := q.(*MatchQuery); ok {
+            analyzerName, fieldVal, matchString, fuzziness, prefix, autoFuzzy = mq.Analyzer, mq.FieldVal, mq.Match, mq.Fuzziness, mq.Prefix, mq.autoFuzzy
+        } else if mpq, ok := q.(*MatchPhraseQuery); ok {
+            analyzerName, fieldVal, matchString, fuzziness, autoFuzzy = mpq.Analyzer, mpq.FieldVal, mpq.MatchPhrase, mpq.Fuzziness, mpq.autoFuzzy
+        }
+        field, source := resolveFieldAndSource(fieldVal)
+        if source != "" {
+            analyzer, err := handleAnalyzer(analyzerName, field)
+            if err != nil {
+                return nil, err
+            }
+            tokens := analyzer.Analyze([]byte(matchString))
+            for _, token := range tokens {
+                if autoFuzzy {
+                    fuzziness = searcher.GetAutoFuzziness(string(token.Term))
+                }
+                rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, string(token.Term), fuzziness, prefix, r, rv)
+                if err != nil {
+                    return nil, err
+                }
+            }
+        }
+    case *MultiPhraseQuery, *PhraseQuery:
+        var fieldVal string
+        var fuzziness int
+        var autoFuzzy bool
+        if mpq, ok := q.(*MultiPhraseQuery); ok {
+            fieldVal, fuzziness, autoFuzzy = mpq.FieldVal, mpq.Fuzziness, mpq.autoFuzzy
+        } else if pq, ok := q.(*PhraseQuery); ok {
+            fieldVal, fuzziness, autoFuzzy = pq.FieldVal, pq.Fuzziness, pq.autoFuzzy
+        }
+        field, source := resolveFieldAndSource(fieldVal)
+        if source != "" {
+            var terms []string
+            if mpq, ok := q.(*MultiPhraseQuery); ok {
+                for _, termGroup := range mpq.Terms {
+                    terms = append(terms, termGroup...)
+                }
+            } else if pq, ok := q.(*PhraseQuery); ok {
+                terms = pq.Terms
+            }
+            for _, term := range terms {
+                if autoFuzzy {
+                    fuzziness = searcher.GetAutoFuzziness(term)
+                }
+                rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, term, fuzziness, 0, r, rv)
+                if err != nil {
+                    return nil, err
+                }
+            }
+        }
+    case *PrefixQuery:
+        field, source := resolveFieldAndSource(q.FieldVal)
+        if source != "" {
+            rv, err = addSynonymsForTermWithMatchType(ctx, PrefixMatchType, source, field, q.Prefix, 0, 0, r, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *QueryStringQuery:
+        expanded, err := expandQuery(m, q)
+        if err != nil {
+            return nil, err
+        }
+        rv, err = ExtractSynonyms(ctx, m, r, expanded, rv)
+        if err != nil {
+            return nil, err
+        }
+    case *TermQuery:
+        field, source := resolveFieldAndSource(q.FieldVal)
+        if source != "" {
+            rv, err = addSynonymsForTerm(ctx, source, field, q.Term, r, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *RegexpQuery:
+        field, source := resolveFieldAndSource(q.FieldVal)
+        if source != "" {
+            rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, strings.TrimPrefix(q.Regexp, "^"), 0, 0, r, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    case *WildcardQuery:
+        field, source := resolveFieldAndSource(q.FieldVal)
+        if source != "" {
+            rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, wildcardRegexpReplacer.Replace(q.Wildcard), 0, 0, r, rv)
+            if err != nil {
+                return nil, err
+            }
+        }
+    }
+    return rv, nil
+}
+
+// addSynonymsForTermWithMatchType finds all terms that match the given term
+// under the given match type (fuzzy, prefix, or regexp) and adds their
+// synonyms to the provided map.
+func addSynonymsForTermWithMatchType(ctx context.Context, matchType int, src, field, term string, fuzziness, prefix int,
+    r index.ThesaurusReader, rv search.FieldTermSynonymMap) (_ search.FieldTermSynonymMap, err error) {
+    // Determine the terms based on the match type (fuzzy, prefix, or regexp)
+    var thesKeys index.ThesaurusKeys
+    var terms []string
+    switch matchType {
+    case FuzzyMatchType:
+        // Ensure valid fuzziness
+        if fuzziness == 0 {
+            rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv)
+            if err != nil {
+                return nil, err
+            }
+            return rv, nil
+        }
+        if fuzziness > searcher.MaxFuzziness {
+            return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness)
+        }
+        if fuzziness < 0 {
+            return nil, fmt.Errorf("invalid fuzziness, negative")
+        }
+        // Handle fuzzy match
+        prefixTerm := ""
+        for i, c := range term {
+            if i < prefix {
+                prefixTerm += string(c)
+            } else {
+                break
+            }
+        }
+        thesKeys, err = r.ThesaurusKeysFuzzy(src, term, fuzziness, prefixTerm)
+    case RegexpMatchType:
+        // Handle regexp match
+        thesKeys, err = r.ThesaurusKeysRegexp(src, term)
+    case PrefixMatchType:
+        // Handle prefix match
+        thesKeys, err = r.ThesaurusKeysPrefix(src, []byte(term))
+    default:
+        return nil, fmt.Errorf("invalid match type: %d", matchType)
+    }
+    if err != nil {
+        return nil, err
+    }
+    // propagate any close error via the named return
+    defer func() {
+        if cerr := thesKeys.Close(); cerr != nil && err == nil {
+            err = cerr
+        }
+    }()
+    // Collect the matching terms
+    terms = []string{}
+    tfd, err := thesKeys.Next()
+    for err == nil && tfd != nil {
+        terms = append(terms, tfd.Term)
+        tfd, err = thesKeys.Next()
+    }
+    if err != nil {
+        return nil, err
+    }
+    for _, synTerm := range terms {
+        rv, err = addSynonymsForTerm(ctx, src, field, synTerm, r, rv)
+        if err != nil {
+            return nil, err
+        }
+    }
+    return rv, nil
+}
+
+func addSynonymsForTerm(ctx context.Context, src, field, term string,
+    r index.ThesaurusReader, rv search.FieldTermSynonymMap) (_ search.FieldTermSynonymMap, err error) {
+
+    termReader, err := r.ThesaurusTermReader(ctx, src, []byte(term))
+    if err != nil {
+        return nil, err
+    }
+    // propagate any close error via the named return
+    defer func() {
+        if cerr := termReader.Close(); cerr != nil && err == nil {
+            err = cerr
+        }
+    }()
+    var synonyms []string
+    synonym, err := termReader.Next()
+    for err == nil && synonym != "" {
+        synonyms = append(synonyms, synonym)
+        synonym, err = termReader.Next()
+    }
+    if err != nil {
+        return nil, err
+    }
+    if len(synonyms) > 0 {
+        if rv == nil {
+            rv = make(search.FieldTermSynonymMap)
+        }
+        if _, exists := rv[field]; !exists {
+            rv[field] = make(map[string][]string)
+        }
+        rv[field][term] = synonyms
+    }
+    return rv, nil
+}
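ExtractFields gives the presearch phase a cheap way to know which fields a query will touch before any searcher is built (query-string queries are expanded through the mapping first). A small sketch, mirroring the test that follows:

    q, err := query.ParseQuery([]byte(`{"term": "water", "field": "desc"}`))
    if err != nil {
        panic(err)
    }
    fields, err := query.ExtractFields(q, mapping.NewIndexMapping(), nil)
    if err != nil {
        panic(err)
    }
    _, ok := fields["desc"] // ok == true; no other field is present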
"2000-03-13T07:13:00Z", + "end": "2005-09-19T09:33:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 883125, + "max": 4817433, + "inclusive_min": true, + "inclusive_max": true + } + ] + }, + { + "conjuncts": [ + { + "field": "date", + "start": "2002-08-10T22:42:00Z", + "end": "2008-02-10T23:19:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 896115, + "max": 3897074, + "inclusive_min": true, + "inclusive_max": true + } + ] + } + ] + } + ] + }`, + expFields: []string{"date", "number", "date2", "number2", "date3"}, + }, + { + query: `{ + "query" : "hardworking people" + }`, + expFields: []string{"_all"}, + }, + { + query: `{ + "query" : "text:hardworking people" + }`, + expFields: []string{"text", "_all"}, + }, + { + query: `{ + "query" : "text:\"hardworking people\"" + }`, + expFields: []string{"text"}, + }, + } + + m := mapping.NewIndexMapping() + for i, test := range testQueries { + q, err := ParseQuery([]byte(test.query)) + if err != nil { + t.Fatal(err) + } + fields, err := ExtractFields(q, m, nil) + if err != nil { + t.Fatal(err) + } + var fieldsSlice []string + for k := range fields { + fieldsSlice = append(fieldsSlice, k) + } + sort.Strings(test.expFields) + sort.Strings(fieldsSlice) + if !reflect.DeepEqual(fieldsSlice, test.expFields) { + t.Errorf("Test %d: expected %v, got %v", i, test.expFields, fieldsSlice) + } + } +} diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 6c29f845d..187486efc 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -17,6 +17,7 @@ package searcher import ( "context" "fmt" + "strings" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" @@ -55,9 +56,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s // since the fuzzy candidate terms are not collected // for a term search, and the only candidate term is // the term itself - fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) - if fuzzyTermMatches != nil { - fuzzyTermMatches.(map[string][]string)[term] = []string{term} + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } } return NewTermSearcher(ctx, indexReader, term, field, boost, options) } @@ -71,7 +74,7 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s break } } - fuzzyCandidates, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, + fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness, field, prefixTerm) if err != nil { return nil, err @@ -94,12 +97,22 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s fuzzyTermMatches.(map[string][]string)[term] = candidates } } + // check if the candidates are empty or have one term which is the term itself + if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) { + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } + } + return NewTermSearcher(ctx, indexReader, term, field, boost, options) + } return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field, boost, editDistances, options, true) } -func getAutoFuzziness(term string) int { +func GetAutoFuzziness(term string) int { termLength := len(term) if 
termLength > AutoFuzzinessHighThreshold { return MaxFuzziness @@ -111,7 +124,7 @@ func getAutoFuzziness(term string) int { func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string, prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { - return NewFuzzySearcher(ctx, indexReader, term, prefix, getAutoFuzziness(term), field, boost, options) + return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options) } type fuzzyCandidates struct { @@ -132,7 +145,7 @@ func reportIOStats(ctx context.Context, bytesRead uint64) { } } -func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, +func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) { rv = &fuzzyCandidates{ candidates: make([]string, 0), @@ -143,7 +156,19 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, // the levenshtein automaton based iterator to collect the // candidate terms if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { - fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + termSet := make(map[string]struct{}) + addCandidateTerm := func(term string, editDistance uint8) error { + if _, exists := termSet[term]; !exists { + termSet[term] = struct{}{} + rv.candidates = append(rv.candidates, term) + rv.editDistances = append(rv.editDistances, editDistance) + if tooManyClauses(len(rv.candidates)) { + return tooManyClausesErr(field, len(rv.candidates)) + } + } + return nil + } + fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm) if err != nil { return nil, err } @@ -154,16 +179,38 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, }() tfd, err := fieldDict.Next() for err == nil && tfd != nil { - rv.candidates = append(rv.candidates, tfd.Term) - rv.editDistances = append(rv.editDistances, tfd.EditDistance) - if tooManyClauses(len(rv.candidates)) { - return nil, tooManyClausesErr(field, len(rv.candidates)) + err = addCandidateTerm(tfd.Term, tfd.EditDistance) + if err != nil { + return nil, err } tfd, err = fieldDict.Next() } - + if err != nil { + return nil, err + } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if !strings.HasPrefix(term, prefixTerm) { + continue + } + match, editDistance := a.MatchAndDistance(term) + if match { + err = addCandidateTerm(term, editDistance) + if err != nil { + return nil, err + } + } + } + } + } + } rv.bytesRead = fieldDict.BytesRead() - return rv, err + return rv, nil } var fieldDict index.FieldDict diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index bf24b465a..07675cfad 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -164,6 +164,42 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, } } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if fuzzinessEnabled { + for term, fuzzyTerms := range fuzzyTermMatches { + fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms)) + if s, found := ts[term]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) 
+ } + for _, fuzzyTerm := range fuzzyTerms { + if fuzzyTerm == term { + continue + } + if s, found := ts[fuzzyTerm]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + } + if len(fuzzySynonymTerms) > 0 { + fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...) + } + } + } else { + for _, termPos := range terms { + for _, term := range termPos { + if s, found := ts[term]; found { + if fuzzyTermMatches == nil { + fuzzyTermMatches = make(map[string][]string) + } + fuzzyTermMatches[term] = s + } + } + } + } + } + } + } mustSearcher, err := NewConjunctionSearcher(ctx, indexReader, termPositionSearchers, options) if err != nil { // close any searchers already opened @@ -337,6 +373,9 @@ func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expanded for term, fuzzyMatches := range s.fuzzyTermMatches { locations := tlm[term] for _, fuzzyMatch := range fuzzyMatches { + if fuzzyMatch == term { + continue + } locations = append(locations, tlm[fuzzyMatch]...) } expandedTlm[term] = locations diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b88133e31..1afdaee02 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -48,7 +48,7 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, return NewRegexpSearcher(ctx, indexReader, r, field, boost, options) } - fieldDict, err := ir.FieldDictRegexp(field, pattern) + fieldDict, a, err := ir.FieldDictRegexpAutomaton(field, pattern) if err != nil { return nil, err } @@ -58,17 +58,37 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, } }() + var termSet = make(map[string]struct{}) var candidateTerms []string tfd, err := fieldDict.Next() for err == nil && tfd != nil { - candidateTerms = append(candidateTerms, tfd.Term) - tfd, err = fieldDict.Next() + if _, exists := termSet[tfd.Term]; !exists { + termSet[tfd.Term] = struct{}{} + candidateTerms = append(candidateTerms, tfd.Term) + tfd, err = fieldDict.Next() + } } if err != nil { return nil, err } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if a.MatchesRegex(term) { + termSet[term] = struct{}{} + candidateTerms = append(candidateTerms, term) + } + } + } + } + } + return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost, options, true) } diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index cd794ea32..c519d8d51 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -38,14 +38,23 @@ type TermSearcher struct { tfd index.TermFieldDoc } -func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { if isTermQuery(ctx) { ctx = context.WithValue(ctx, search.QueryTypeKey, search.Term) } return NewTermSearcherBytes(ctx, indexReader, []byte(term), field, boost, options) } -func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcherBytes(ctx context.Context, 
indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if s, found := ts[string(term)]; found { + return NewSynonymSearcher(ctx, indexReader, term, s, field, boost, options) + } + } + } + } needFreqNorm := options.Score != "none" reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { @@ -69,6 +78,50 @@ func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermF }, nil } +func NewSynonymSearcher(ctx context.Context, indexReader index.IndexReader, term []byte, synonyms []string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + createTermSearcher := func(term []byte, boostVal float64) (search.Searcher, error) { + needFreqNorm := options.Score != "none" + reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) + if err != nil { + return nil, err + } + return newTermSearcherFromReader(indexReader, reader, term, field, boostVal, options) + } + // create a searcher for the term itself + termSearcher, err := createTermSearcher(term, boost) + if err != nil { + return nil, err + } + // constituent searchers of the disjunction + qsearchers := make([]search.Searcher, 0, len(synonyms)+1) + // helper method to close all the searchers we've created + // in case of an error + qsearchersClose := func() { + for _, searcher := range qsearchers { + if searcher != nil { + _ = searcher.Close() + } + } + } + qsearchers = append(qsearchers, termSearcher) + // create a searcher for each synonym + for _, synonym := range synonyms { + synonymSearcher, err := createTermSearcher([]byte(synonym), boost/2.0) + if err != nil { + qsearchersClose() + return nil, err + } + qsearchers = append(qsearchers, synonymSearcher) + } + // create a disjunction searcher + rv, err := NewDisjunctionSearcher(ctx, indexReader, qsearchers, 0, options) + if err != nil { + qsearchersClose() + return nil, err + } + return rv, nil +} + func (s *TermSearcher) Size() int { return reflectStaticSizeTermSearcher + size.SizeOfPtr + s.reader.Size() + diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go index dc16e4864..3d98cd28e 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -16,6 +16,7 @@ package searcher import ( "context" + "strings" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" @@ -36,13 +37,17 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p }() var terms []string + var termSet = make(map[string]struct{}) tfd, err := fieldDict.Next() for err == nil && tfd != nil { - terms = append(terms, tfd.Term) - if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(field, len(terms)) + if _, exists := termSet[tfd.Term]; !exists { + termSet[tfd.Term] = struct{}{} + terms = append(terms, tfd.Term) + if tooManyClauses(len(terms)) { + return nil, tooManyClausesErr(field, len(terms)) + } + tfd, err = fieldDict.Next() } - tfd, err = fieldDict.Next() } if err != nil { return nil, err @@ -53,5 +58,29 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p search.RecordSearchCost(ctx, search.AddM, fieldDict.BytesRead()) } + if ctx != nil { + if 
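This is the point where presearch results actually reach query execution: the merged FieldTermSynonymMap travels in the context under FieldTermSynonymMapKey, and NewTermSearcherBytes consults it before touching the index. A sketch, assuming indexReader and options are already in scope:

    fts := search.FieldTermSynonymMap{
        "text": {"quick": {"fast", "speedy"}},
    }
    ctx := context.WithValue(context.Background(), search.FieldTermSynonymMapKey, fts)
    // "quick" on field "text" now becomes a disjunction over the term itself
    // (full boost) and each synonym (half boost), via NewSynonymSearcher
    s, err := searcher.NewTermSearcher(ctx, indexReader, "quick", "text", 1.0, options)
    if err != nil {
        panic(err)
    }
    defer func() { _ = s.Close() }()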
diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go
index dc16e4864..3d98cd28e 100644
--- a/search/searcher/search_term_prefix.go
+++ b/search/searcher/search_term_prefix.go
@@ -16,6 +16,7 @@ package searcher
 
 import (
 	"context"
+	"strings"
 
 	"github.com/blevesearch/bleve/v2/search"
 	index "github.com/blevesearch/bleve_index_api"
@@ -36,13 +37,17 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, prefix string,
 	}()
 
 	var terms []string
+	var termSet = make(map[string]struct{})
 	tfd, err := fieldDict.Next()
 	for err == nil && tfd != nil {
-		terms = append(terms, tfd.Term)
-		if tooManyClauses(len(terms)) {
-			return nil, tooManyClausesErr(field, len(terms))
+		if _, exists := termSet[tfd.Term]; !exists {
+			termSet[tfd.Term] = struct{}{}
+			terms = append(terms, tfd.Term)
+			if tooManyClauses(len(terms)) {
+				return nil, tooManyClausesErr(field, len(terms))
+			}
 		}
 		tfd, err = fieldDict.Next()
 	}
 	if err != nil {
 		return nil, err
 	}
@@ -53,5 +58,29 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, prefix string,
 		search.RecordSearchCost(ctx, search.AddM, fieldDict.BytesRead())
 	}
 
+	// also consider synonym terms carried in the context that share the prefix
+	if ctx != nil {
+		if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
+			if ts, exists := fts[field]; exists {
+				for term := range ts {
+					if _, exists := termSet[term]; exists {
+						continue
+					}
+					if strings.HasPrefix(term, prefix) {
+						termSet[term] = struct{}{}
+						terms = append(terms, term)
+						if tooManyClauses(len(terms)) {
+							return nil, tooManyClausesErr(field, len(terms))
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// check if the terms are empty or have one term which is the prefix itself
+	if len(terms) == 0 || (len(terms) == 1 && terms[0] == prefix) {
+		return NewTermSearcher(ctx, indexReader, prefix, field, boost, options)
+	}
+
 	return NewMultiTermSearcher(ctx, indexReader, terms, field, boost, options, true)
 }
diff --git a/search/util.go b/search/util.go
index 6472803d1..9f5a15cac 100644
--- a/search/util.go
+++ b/search/util.go
@@ -136,6 +136,7 @@ const MinGeoBufPoolSize = 24
 type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool
 
 const KnnPreSearchDataKey = "_knn_pre_search_data_key"
+const SynonymPreSearchDataKey = "_synonym_pre_search_data_key"
 
 const PreSearchKey = "_presearch_key"
 
@@ -144,5 +145,23 @@ type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation)
 type SearcherStartCallbackFn func(size uint64) error
 type SearcherEndCallbackFn func(size uint64) error
 
+// field -> term -> synonyms
+type FieldTermSynonymMap map[string]map[string][]string
+
+func (f *FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) {
+	for field, termSynonymMap := range fts {
+		// Ensure the field exists in the receiver
+		if _, exists := (*f)[field]; !exists {
+			(*f)[field] = make(map[string][]string)
+		}
+		for term, synonyms := range termSynonymMap {
+			// Append synonyms
+			(*f)[field][term] = append((*f)[field][term], synonyms...)
+		}
+	}
+}
+
+const FieldTermSynonymMapKey = "_field_term_synonym_map_key"
+
 const SearcherStartCallbackKey = "_searcher_start_callback_key"
 const SearcherEndCallbackKey = "_searcher_end_callback_key"
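MergeWith is what the synonym presearch processor uses to fold per-index maps into one; note that it appends rather than de-duplicates, so any duplicates across indexes are preserved. For example:

    a := search.FieldTermSynonymMap{"text": {"quick": {"fast"}}}
    b := search.FieldTermSynonymMap{"text": {"quick": {"speedy"}, "color": {"red"}}}
    a.MergeWith(b)
    // a["text"]["quick"] == []string{"fast", "speedy"}
    // a["text"]["color"] == []string{"red"}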
diff --git a/search_knn.go b/search_knn.go
index 309b36593..e5fd595d4 100644
--- a/search_knn.go
+++ b/search_knn.go
@@ -381,7 +381,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) {
 			continue
 		}
 
-		if _, ok := filterQ.(*query.MatchNoneQuery); ok {
+		if isMatchNoneQuery(filterQ) {
 			// Filtering required since no hits are eligible.
 			requiresFiltering[idx] = true
 			// a match none query just means none the documents are eligible
@@ -559,7 +559,7 @@ func requestHasKNN(req *SearchRequest) bool {
 func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool {
 	// if req.Query is not match_none => then we need to go to phase 2
 	// to perform the actual query.
-	if _, ok := req.Query.(*query.MatchNoneQuery); !ok {
+	if !isMatchNoneQuery(req.Query) {
 		return false
 	}
 	// req.Query is a match_none query
@@ -598,41 +598,6 @@ func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
 	dummyReq.Sort = realReq.Sort
 }
 
-// the preSearchData for KNN is a list of DocumentMatch objects
-// that need to be redistributed to the right index.
-// This is used only in the case of an alias tree, where the indexes
-// are at the leaves of the tree, and the master alias is at the root.
-// At each level of the tree, the preSearchData needs to be redistributed
-// to the indexes/aliases at that level. Because the preSearchData is
-// specific to each final index at the leaf.
-func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
-	knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch)
-	if !ok {
-		return nil, fmt.Errorf("request does not have knn preSearchData for redistribution")
-	}
-	segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
-	if err != nil {
-		return nil, err
-	}
-
-	rv := make(map[string]map[string]interface{})
-	for _, index := range indexes {
-		rv[index.Name()] = make(map[string]interface{})
-	}
-
-	for _, index := range indexes {
-		for k, v := range req.PreSearchData {
-			switch k {
-			case search.KnnPreSearchDataKey:
-				rv[index.Name()][k] = segregatedKnnHits[index.Name()]
-			default:
-				rv[index.Name()][k] = v
-			}
-		}
-	}
-	return rv, nil
-}
-
 func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor {
 	kArray := make([]int64, len(req.KNN))
 	for i, knnReq := range req.KNN {
diff --git a/search_no_knn.go b/search_no_knn.go
index bb72e15a9..c91980589 100644
--- a/search_no_knn.go
+++ b/search_no_knn.go
@@ -187,7 +187,7 @@ func requestHasKNN(req *SearchRequest) bool {
 func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
 }
 
-func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) {
 	return nil, nil
 }
diff --git a/search_test.go b/search_test.go
index c39a58558..043be4073 100644
--- a/search_test.go
+++ b/search_test.go
@@ -15,10 +15,13 @@
 package bleve
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"math"
+	"math/rand"
 	"reflect"
+	"sort"
 	"strconv"
 	"strings"
 	"testing"
@@ -39,6 +42,7 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds"
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds"
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds"
+	"github.com/blevesearch/bleve/v2/analysis/lang/en"
 	"github.com/blevesearch/bleve/v2/analysis/token/length"
 	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
 	"github.com/blevesearch/bleve/v2/analysis/token/shingle"
@@ -3746,3 +3750,628 @@ func TestAutoFuzzy(t *testing.T) {
 		}
 	}
 }
+
+func TestThesaurusTermReader(t *testing.T) {
+    tmpIndexPath := createTmpIndexPath(t)
+    defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+    synonymCollection := "collection1"
+
+    synonymSourceName := "english"
+
+    analyzer := simple.Name
+
+    synonymSourceConfig := map[string]interface{}{
+        "collection": synonymCollection,
+        "analyzer":   analyzer,
+    }
+
+    textField := mapping.NewTextFieldMapping()
+    textField.Analyzer = analyzer
+    textField.SynonymSource = synonymSourceName
+
+    imap := mapping.NewIndexMapping()
+    imap.DefaultMapping.AddFieldMappingsAt("text", textField)
+    err := imap.AddSynonymSource(synonymSourceName, synonymSourceConfig)
+    if err != nil {
+        t.Fatal(err)
+    }
+    err = imap.Validate()
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    idx, err := New(tmpIndexPath, imap)
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer func() {
+        err = idx.Close()
+        if err != nil {
+            t.Fatal(err)
+        }
+    }()
+
+    documents := map[string]map[string]interface{}{
+        "doc1": {
+            "text": "quick brown fox eats",
+        },
+        "doc2": {
+            "text": "fast red wolf jumps",
+        },
+        "doc3": {
+            "text": "quick red cat runs",
+        },
+        "doc4": {
+            "text": "speedy brown dog barks",
+        },
+        "doc5": {
+            "text": "fast green rabbit hops",
+        },
+    }
+
+    batch := idx.NewBatch()
+    for docID, doc := range documents {
+        err := batch.Index(docID, doc)
+        if err != nil {
+            t.Fatal(err)
+        }
+    }
+
+    synonymDocuments := map[string]*SynonymDefinition{
+        "synDoc1": {
+            Synonyms: []string{"quick", "fast", "speedy"},
+        },
+        "synDoc2": {
+            Input:    []string{"color", "colour"},
+            Synonyms: []string{"red", "green", "blue", "yellow", "brown"},
+        },
+        "synDoc3": {
+            Input:    []string{"animal", "creature"},
+            Synonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"},
+        },
+        "synDoc4": {
+            Synonyms: []string{"eats", "jumps", "runs", "barks", "hops"},
+        },
+    }
+
+    for synName, synDef := range synonymDocuments {
+        err := batch.IndexSynonym(synName, "collection1", synDef)
+        if err != nil {
+            t.Fatal(err)
+        }
+    }
+    err = idx.Batch(batch)
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    sco, err := idx.Advanced()
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    reader, err := sco.Reader()
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer func() {
+        err = reader.Close()
+        if err != nil {
+            t.Fatal(err)
+        }
+    }()
+
+    thesReader, ok := reader.(index.ThesaurusReader)
+    if !ok {
+        t.Fatal("expected thesaurus reader")
+    }
+
+    type testStruct struct {
+        queryTerm        string
+        expectedSynonyms []string
+    }
+
+    testQueries := []testStruct{
+        {
+            queryTerm:        "quick",
+            expectedSynonyms: []string{"fast", "speedy"},
+        },
+        {
+            queryTerm:        "red",
+            expectedSynonyms: []string{},
+        },
+        {
+            queryTerm:        "color",
+            expectedSynonyms: []string{"red", "green", "blue", "yellow", "brown"},
+        },
+        {
+            queryTerm:        "colour",
+            expectedSynonyms: []string{"red", "green", "blue", "yellow", "brown"},
+        },
+        {
+            queryTerm:        "animal",
+            expectedSynonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"},
+        },
+        {
+            queryTerm:        "creature",
+            expectedSynonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"},
+        },
+        {
+            queryTerm:        "fox",
+            expectedSynonyms: []string{},
+        },
+        {
+            queryTerm:        "eats",
+            expectedSynonyms: []string{"jumps", "runs", "barks", "hops"},
+        },
+        {
+            queryTerm:        "jumps",
+            expectedSynonyms: []string{"eats", "runs", "barks", "hops"},
+        },
+    }
+
+    for _, test := range testQueries {
+        str, err := thesReader.ThesaurusTermReader(context.Background(), synonymSourceName, []byte(test.queryTerm))
+        if err != nil {
+            t.Fatal(err)
+        }
+        var gotSynonyms []string
+        for {
+            synonym, err := str.Next()
+            if err != nil {
+                t.Fatal(err)
+            }
+            if synonym == "" {
+                break
+            }
+            gotSynonyms = append(gotSynonyms, synonym)
+        }
+        if len(gotSynonyms) != len(test.expectedSynonyms) {
+            t.Fatalf("expected %d synonyms, got %d", len(test.expectedSynonyms), len(gotSynonyms))
+        }
+        sort.Strings(gotSynonyms)
+        sort.Strings(test.expectedSynonyms)
+        for i, syn := range gotSynonyms {
+            if syn != test.expectedSynonyms[i] {
+                t.Fatalf("expected synonym %s, got %s", test.expectedSynonyms[i], syn)
+            }
+        }
+    }
+}
+
+func TestSynonymSearchQueries(t *testing.T) {
+    tmpIndexPath := createTmpIndexPath(t)
+    defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+    synonymCollection := "collection1"
+
+    synonymSourceName := "english"
+
+    analyzer := en.AnalyzerName
+
+    synonymSourceConfig := map[string]interface{}{
+        "collection": synonymCollection,
+        "analyzer":   analyzer,
+    }
+
+    textField := mapping.NewTextFieldMapping()
+    textField.Analyzer = analyzer
+    textField.SynonymSource = synonymSourceName
+
+    imap := mapping.NewIndexMapping()
+    imap.DefaultMapping.AddFieldMappingsAt("text", textField)
+    err := imap.AddSynonymSource(synonymSourceName, synonymSourceConfig)
+    if err != nil {
+        t.Fatal(err)
+    }
+    err = imap.Validate()
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    idx, err := New(tmpIndexPath, imap)
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer func() {
+        err = idx.Close()
+        if err != nil {
+            t.Fatal(err)
+        }
+    }()
+
+    documents := map[string]map[string]interface{}{
+        "doc1": {
+            "text": `The hardworking employee consistently strives to exceed expectations.
+                    His industrious nature makes him a valuable asset to any team.
+                    His conscientious attention to detail ensures that projects are completed efficiently and accurately.
+                    He remains persistent even in the face of challenges.`,
+        },
+        "doc2": {
+            "text": `The tranquil surroundings of the retreat provide a perfect escape from the hustle and bustle of city life.
+                    Guests enjoy the peaceful atmosphere, which is perfect for relaxation and rejuvenation.
+                    The calm environment offers the ideal place to meditate and connect with nature.
+                    Even the most stressed individuals find themselves feeling relaxed and at ease.`,
+        },
+        "doc3": {
+            "text": `The house was burned down, leaving only a charred shell behind.
+                    The intense heat of the flames caused the walls to warp and the roof to cave in.
+                    The seared remains of the furniture told the story of the blaze.
+                    The incinerated remains left little more than ashes to remember what once was.`,
+        },
+        "doc4": {
+            "text": `The faithful dog followed its owner everywhere, always loyal and steadfast.
+                    It was devoted to protecting its family, and its reliable nature meant it could always be trusted.
+                    In the face of danger, the dog remained calm, knowing its role was to stay vigilant.
+                    Its trustworthy companionship provided comfort and security.`,
+        },
+        "doc5": {
+            "text": `The lively market is bustling with activity from morning to night.
+                    The dynamic energy of the crowd fills the air as vendors sell their wares.
+                    Shoppers wander from stall to stall, captivated by the vibrant colors and energetic atmosphere.
+                    This place is alive with movement and life.`,
+        },
+        "doc6": {
+            "text": `In moments of crisis, bravery shines through.
+                    It takes valor to step forward when others are afraid to act.
+                    Heroes are defined by their guts and nerve, taking risks to protect others.
+                    Boldness in the face of danger is what sets them apart.`,
+        },
+        "doc7": {
+            "text": `Innovation is the driving force behind progress in every industry.
+                    The company fosters an environment of invention, encouraging creativity at every level.
+                    The focus on novelty and improvement means that ideas are always evolving.
+                    The development of new solutions is at the core of the company's mission.`,
+        },
+        "doc8": {
+            "text": `The blazing sunset cast a radiant glow over the horizon, painting the sky with hues of red and orange.
+                    The intense heat of the day gave way to a fiery display of color.
+                    As the sun set, the glowing light illuminated the landscape, creating a breathtaking scene.
+                    The fiery sky was a sight to behold.`,
+        },
+        "doc9": {
+            "text": `The fertile soil of the valley makes it perfect for farming.
+                    The productive land yields abundant crops year after year.
+                    Farmers rely on the rich, fruitful ground to sustain their livelihoods.
+                    The area is known for its plentiful harvests, supporting both local communities and export markets.`,
+        },
+        "doc10": {
+            "text": `The arid desert is a vast, dry expanse with little water or vegetation.
+                    The barren landscape stretches as far as the eye can see, offering little respite from the scorching sun.
+                    The desolate environment is unforgiving to those who venture too far without preparation.
+                    The parched earth cracks under the heat, creating a harsh, unyielding terrain.`,
+        },
+        "doc11": {
+            "text": `The fox is known for its cunning and intelligence.
+                    As a predator, it relies on its sharp instincts to outwit its prey.
+                    Its vulpine nature makes it both mysterious and fascinating.
+                    The fox's ability to hunt with precision and stealth is what makes it such a formidable hunter.`,
+        },
+        "doc12": {
+            "text": `The dog is often considered man's best friend due to its loyal nature.
+                    As a companion, the hound provides both protection and affection.
+                    The puppy quickly becomes a member of the family, always by your side.
+                    Its playful energy and unshakable loyalty make it a beloved pet.`,
+        },
+        "doc13": {
+            "text": `He worked tirelessly through the night, always persistent in his efforts.
+                    His industrious approach to problem-solving kept the project moving forward.
+                    No matter how difficult the task, he remained focused, always giving his best.
+                    His dedication paid off when the project was completed ahead of schedule.`,
+        },
+        "doc14": {
+            "text": `The river flowed calmly through the valley, its peaceful current offering a sense of tranquility.
+                    Fishermen relaxed by the banks, enjoying the calm waters that reflected the sky above.
+                    The tranquil nature of the river made it a perfect spot for meditation.
+                    As the day ended, the river's quiet flow brought a sense of peace.`,
+        },
+        "doc15": {
+            "text": `After the fire, all that was left was the charred remains of what once was.
+                    The seared walls of the house told a tragic story.
+                    The intensity of the blaze had burned everything in its path, leaving only the smoldering wreckage behind.
+                    The incinerated objects could not be salvaged, and the damage was beyond repair.`,
+        },
+        "doc16": {
+            "text": `The devoted employee always went above and beyond to complete his tasks.
+                    His steadfast commitment to the company made him a valuable team member.
+                    He was reliable, never failing to meet deadlines.
+                    His trustworthiness earned him the respect of his colleagues, and was considered an
+                    ingenious expert in his field.`,
+        },
+        "doc17": {
+            "text": `The city is vibrant, full of life and energy.
+                    The dynamic pace of the streets reflects the diverse culture of its inhabitants.
+                    People from all walks of life contribute to the energetic atmosphere.
+                    The city's lively spirit can be felt in every corner, from the bustling markets to the lively festivals.`,
+        },
+        "doc18": {
+            "text": `In a moment of uncertainty, he made a bold decision that would change his life forever.
+                    It took courage and nerve to take the leap, but his bravery paid off.
+                    The guts to face the unknown allowed him to achieve something remarkable.
+                    Being a bright scholar, the skill he demonstrated inspired those around him.`,
+        },
+        "doc19": {
+            "text": `Innovation is often born from necessity, and the lightbulb is a prime example.
+                    Thomas Edison's invention changed the world, offering a new way to see the night.
+                    The creativity involved in developing such a groundbreaking product sparked a wave of
+                    novelty in the scientific community. This improvement in technology continues to shape the modern world.
+                    He was a clever academic and a smart researcher.`,
+        },
+        "doc20": {
+            "text": `The fiery volcano erupted with a force that shook the earth. Its radiant lava flowed down the sides,
+                    illuminating the night sky. The intense heat from the eruption could be felt miles away, as the
+                    glowing lava burned everything in its path. The fiery display was both terrifying and mesmerizing.`,
+        },
+    }
+
+    synonymDocuments := map[string]*SynonymDefinition{
+        "synDoc1": {
+            Synonyms: []string{"hardworking", "industrious", "conscientious", "persistent", "focused", "devoted"},
+        },
+        "synDoc2": {
+            Synonyms: []string{"tranquil", "peaceful", "calm", "relaxed", "unruffled"},
+        },
+        "synDoc3": {
+            Synonyms: []string{"burned", "charred", "seared", "incinerated", "singed"},
+        },
+        "synDoc4": {
+            Synonyms: []string{"faithful", "steadfast", "devoted", "reliable", "trustworthy"},
+        },
+        "synDoc5": {
+            Synonyms: []string{"lively", "dynamic", "energetic", "vivid", "vibrating"},
+        },
+        "synDoc6": {
+            Synonyms: []string{"bravery", "valor", "guts", "nerve", "boldness"},
+        },
+        "synDoc7": {
+            Input:    []string{"innovation"},
+            Synonyms: []string{"invention", "creativity", "novelty", "improvement", "development"},
+        },
+        "synDoc8": {
+            Input:    []string{"blazing"},
+            Synonyms: []string{"intense", "radiant", "burning", "fiery", "glowing"},
+        },
+        "synDoc9": {
+            Input:    []string{"fertile"},
+            Synonyms: []string{"productive", "fruitful", "rich", "abundant", "plentiful"},
+        },
+        "synDoc10": {
+            Input:    []string{"arid"},
+            Synonyms: []string{"dry", "barren", "desolate", "parched", "unfertile"},
+        },
+        "synDoc11": {
+            Input:    []string{"fox"},
+            Synonyms: []string{"vulpine", "canine", "predator", "hunter", "pursuer"},
+        },
+        "synDoc12": {
+            Input:    []string{"dog"},
+            Synonyms: []string{"canine", "hound", "puppy", "pup", "companion"},
+        },
+        "synDoc13": {
+            Synonyms: []string{"researcher", "scientist", "scholar", "academic", "expert"},
+        },
+        "synDoc14": {
+            Synonyms: []string{"bright", "clever", "ingenious", "sharp", "astute", "smart"},
+        },
+    }
+
+    // Combine both maps into a slice of document IDs (both kinds of
+    // documents are batched the same way)
+    var combinedDocIDs []string
+    for id := range synonymDocuments {
+        combinedDocIDs = append(combinedDocIDs, id)
+    }
+    for id := range documents {
+        combinedDocIDs = append(combinedDocIDs, id)
+    }
+    rand.Shuffle(len(combinedDocIDs), func(i, j int) {
+        combinedDocIDs[i], combinedDocIDs[j] = combinedDocIDs[j], combinedDocIDs[i]
+    })
+
+    // Helper to split the document IDs into batches of the given size
+    createDocBatches := func(docs []string, batchSize int) [][]string {
+        var batches [][]string
+        for i := 0; i < len(docs); i += batchSize {
+            end := i + batchSize
+            if end > len(docs) {
+                end = len(docs)
+            }
+            batches = append(batches, docs[i:end])
+        }
+        return batches
+    }
+    // Create batches of 5 documents
+    var batchSize = 5
+    docBatches := createDocBatches(combinedDocIDs, batchSize)
+    if len(docBatches) == 0 {
+        t.Fatal("expected batches")
+    }
+    totalDocs := 0
+    for _, batch := range docBatches {
+        totalDocs += len(batch)
+    }
+    if totalDocs != len(combinedDocIDs) {
+        t.Fatalf("expected %d documents, got %d", len(combinedDocIDs), totalDocs)
+    }
+
+    var batches []*Batch
+    for _, docBatch := range docBatches {
+        batch := idx.NewBatch()
+        for _, docID := range docBatch {
+            if synDef, ok := synonymDocuments[docID]; ok {
+                err := batch.IndexSynonym(docID, synonymCollection, synDef)
+                if err != nil {
+                    t.Fatal(err)
+                }
+            } else {
+                err := batch.Index(docID, documents[docID])
+                if err != nil {
+                    t.Fatal(err)
+                }
+            }
+        }
+        batches = append(batches, batch)
+    }
+    for _, batch := range batches {
+        err = idx.Batch(batch)
+        if err != nil {
+            t.Fatal(err)
+        }
+    }
+
+    type testStruct struct {
+        query      string
+        expectHits []string
+    }
+
+    testQueries := []testStruct{
+        {
+            query: `{
+                "match": "hardworking employee",
+                "field": "text"
+            }`,
+            expectHits: []string{"doc1", "doc13", "doc16", "doc4", "doc7"},
+        },
+        {
+            query: `{
+                "match": "Hardwork and industrius efforts bring lovely and tranqual moments, with a glazing blow of valour.",
+                "field": "text",
+                "fuzziness": "auto"
+            }`,
+            expectHits: []string{
+                "doc1", "doc13", "doc14", "doc15", "doc16",
+                "doc17", "doc18", "doc2", "doc20", "doc3",
+                "doc4", "doc5", "doc6", "doc7", "doc8", "doc9",
+            },
+        },
+        {
+            query: `{
+                "prefix": "in",
+                "field": "text"
+            }`,
+            expectHits: []string{
+                "doc1", "doc11", "doc13", "doc15", "doc16",
+                "doc17", "doc18", "doc19", "doc2", "doc20",
+                "doc3", "doc4", "doc7", "doc8",
+            },
+        },
+        {
+            query: `{
+                "prefix": "vivid",
+                "field": "text"
+            }`,
+            expectHits: []string{
+                "doc17", "doc5",
+            },
+        },
+        {
+            query: `{
+                "match_phrase": "smart academic",
+                "field": "text"
+            }`,
+            expectHits: []string{"doc16", "doc18", "doc19"},
+        },
+        {
+            query: `{
+                "match_phrase": "smrat acedemic",
+                "field": "text",
+                "fuzziness": "auto"
+            }`,
+            expectHits: []string{"doc16", "doc18", "doc19"},
+        },
+        {
+            query: `{
+                "wildcard": "br*",
+                "field": "text"
+            }`,
+            expectHits: []string{"doc11", "doc14", "doc16", "doc18", "doc19", "doc6", "doc8"},
+        },
+    }
+
+    runTestQueries := func(idx Index) error {
+        for _, dtq := range testQueries {
+            q, err := query.ParseQuery([]byte(dtq.query))
+            if err != nil {
+                return err
+            }
+            sr := NewSearchRequest(q)
+            sr.Highlight = NewHighlightWithStyle(ansi.Name)
+            sr.SortBy([]string{"_id"})
+            sr.Fields = []string{"*"}
+            sr.Size = 30
+            sr.Explain = true
+            res, err := idx.Search(sr)
+            if err != nil {
+                return err
+            }
+            if len(res.Hits) != len(dtq.expectHits) {
+                return fmt.Errorf("expected %d hits, got %d", len(dtq.expectHits), len(res.Hits))
+            }
+            // sort the expected hits to match the order of the search results
+            sort.Strings(dtq.expectHits)
+            for i, hit := range res.Hits {
+                if hit.ID != dtq.expectHits[i] {
+                    return fmt.Errorf("expected docID %s, got %s", dtq.expectHits[i], hit.ID)
+                }
+            }
+        }
+        return nil
+    }
+    err = runTestQueries(idx)
+    if err != nil {
+        t.Fatal(err)
+    }
+    // test with index alias - with 1 batch per index
+    numIndexes := len(batches)
+    indexes := make([]Index, numIndexes)
+    indexesPath := make([]string, numIndexes)
+    for i := 0; i < numIndexes; i++ {
+        tmpIndexPath := createTmpIndexPath(t)
+        idx, err := New(tmpIndexPath, imap)
+        if err != nil {
+            t.Fatal(err)
+        }
+        err = idx.Batch(batches[i])
+        if err != nil {
+            t.Fatal(err)
+        }
+        indexes[i] = idx
+        indexesPath[i] = tmpIndexPath
+    }
+    defer func() {
+        for i := 0; i < numIndexes; i++ {
+            err = indexes[i].Close()
+            if err != nil {
+                t.Fatal(err)
+            }
+            cleanupTmpIndexPath(t, indexesPath[i])
+        }
+    }()
+    alias := NewIndexAlias(indexes...)
+    alias.SetIndexMapping(imap)
+    err = runTestQueries(alias)
+    if err != nil {
+        t.Fatal(err)
+    }
+    // test with a multi-level alias now, with two indexes per alias
+    // and any leftover index added to the final alias
+    numAliases := numIndexes / 2
+    extraIndex := numIndexes % 2
+    aliases := make([]IndexAlias, numAliases)
+    for i := 0; i < numAliases; i++ {
+        alias := NewIndexAlias(indexes[i*2], indexes[i*2+1])
+        aliases[i] = alias
+    }
+    if extraIndex > 0 {
+        aliases[numAliases-1].Add(indexes[numIndexes-1])
+    }
+    alias = NewIndexAlias()
+    alias.SetIndexMapping(imap)
+    for i := 0; i < numAliases; i++ {
+        alias.Add(aliases[i])
+    }
+    err = runTestQueries(alias)
+    if err != nil {
+        t.Fatal(err)
+    }
+}