From f701dd977f2cbae569f5a75fafb3e999b908c82e Mon Sep 17 00:00:00 2001 From: "shucai.ljp" Date: Thu, 26 Nov 2015 18:46:26 +0800 Subject: [PATCH] add must not search in indexer.go and add some test cases. --- core/indexer.go | 102 +++++++++++++++++++++++++++++++++++------ core/indexer_test.go | 106 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 13 deletions(-) diff --git a/core/indexer.go b/core/indexer.go index 96dab38..e174459 100644 --- a/core/indexer.go +++ b/core/indexer.go @@ -141,15 +141,15 @@ func (indexer *Indexer) Lookup( } numDocs = 0 - // 合并关键词和标签为搜索键 - keywords := make([]string, len(tokens)+len(labels)) - copy(keywords, tokens) - copy(keywords[len(tokens):], labels) + mustKeywords, mustTokensLength, mustNotKeywords, isValid := getProssedQueries(tokens, labels) + if !isValid { + return + } indexer.tableLock.RLock() defer indexer.tableLock.RUnlock() - table := make([]*KeywordIndices, len(keywords)) - for i, keyword := range keywords { + table := make([]*KeywordIndices, len(mustKeywords)) + for i, keyword := range mustKeywords { indices, found := indexer.tableLock.table[keyword] if !found { // 当反向索引表中无此搜索键时直接返回 @@ -160,6 +160,15 @@ func (indexer *Indexer) Lookup( } } + // 保存must not搜索键 + mustNotTable := make([]*KeywordIndices, 0) + for _, keyword := range mustNotKeywords { + indices, found := indexer.tableLock.table[keyword] + if found { + mustNotTable = append(mustNotTable, indices) + } + } + // 当没有找到时直接返回 if len(table) == 0 { return @@ -171,6 +180,7 @@ func (indexer *Indexer) Lookup( for iTable := 0; iTable < len(table); iTable++ { indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1 } + // 平均文本关键词长度,用于计算BM25 avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments) for ; indexPointers[0] >= 0; indexPointers[0]-- { @@ -186,7 +196,9 @@ func (indexer *Indexer) Lookup( continue } } + iTable := 1 + found := true for ; iTable < len(table); iTable++ { // 二分法比简单的顺序归并效率高,也有更高效率的算法, @@ -196,7 +208,11 @@ func (indexer *Indexer) Lookup( position, foundBaseDocId := indexer.searchIndex(table[iTable], 0, indexPointers[iTable], baseDocId) if foundBaseDocId { - indexPointers[iTable] = position + if !indexer.findInMustNotTable(mustNotTable, baseDocId) { + indexPointers[iTable] = position + } else { + found = false + } } else { if position == 0 { // 该搜索键中所有的文档ID都比baseDocId大,因此已经没有 @@ -211,6 +227,14 @@ func (indexer *Indexer) Lookup( } } + // 如果搜索键只返回一个反向表, 并且存在逻辑非搜索键 + // 则需要判断baseDocId是不是在逻辑非反向表中 + if len(table) == 1 && len(mustNotTable) > 0 { + if indexer.findInMustNotTable(mustNotTable, baseDocId) { + found = false + } + } + if found { indexedDoc := types.IndexedDocument{} @@ -218,12 +242,12 @@ func (indexer *Indexer) Lookup( if indexer.initOptions.IndexType == types.LocationsIndex { // 计算有多少关键词是带有距离信息的 numTokensWithLocations := 0 - for i, t := range table[:len(tokens)] { + for i, t := range table[:mustTokensLength] { if len(t.locations[indexPointers[i]]) > 0 { numTokensWithLocations++ } } - if numTokensWithLocations != len(tokens) { + if numTokensWithLocations != mustTokensLength { if !countDocsOnly { docs = append(docs, types.IndexedDocument{ DocId: baseDocId, @@ -234,13 +258,13 @@ func (indexer *Indexer) Lookup( } // 计算搜索键在文档中的紧邻距离 - tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens) + tokenProximity, tokenLocations := computeTokenProximity(table[:mustTokensLength], indexPointers, mustKeywords[:mustTokensLength]) indexedDoc.TokenProximity = int32(tokenProximity) indexedDoc.TokenSnippetLocations = tokenLocations // 添加TokenLocations - indexedDoc.TokenLocations = make([][]int, len(tokens)) - for i, t := range table[:len(tokens)] { + indexedDoc.TokenLocations = make([][]int, mustTokensLength) + for i, t := range table[:mustTokensLength] { indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]] } } @@ -250,7 +274,7 @@ func (indexer *Indexer) Lookup( indexer.initOptions.IndexType == types.FrequenciesIndex { bm25 := float32(0) d := indexer.docTokenLengths[baseDocId] - for i, t := range table[:len(tokens)] { + for i, t := range table[:mustTokensLength] { var frequency float32 if indexer.initOptions.IndexType == types.LocationsIndex { frequency = float32(len(t.locations[indexPointers[i]])) @@ -423,3 +447,55 @@ func (indexer *Indexer) RemoveDoc(docId uint64) { indexer.numDocuments-- indexer.tableLock.Unlock() } + +func getProssedQueries(tokens []string, labels []string) ( + []string, int, []string, bool) { + mustTokensLength := 0 + mustKeywords := make([]string, 0) + mustNotKeywords := make([]string, 0) + + for _, v := range tokens { + if len(v) > 0 && v[0:1] == "+" { + mustKeywords = append(mustKeywords, v[1:]) + mustTokensLength++ + } + if len(v) > 0 && v[0:1] == "-" { + mustNotKeywords = append(mustNotKeywords, v[1:]) + } + if len(v) > 0 && v[:1] != "+" && v[:1] != "-" { + mustKeywords = append(mustKeywords, v) + mustTokensLength++ + } + } + + for _, v := range labels { + if len(v) > 0 && v[0:1] == "+" { + mustKeywords = append(mustKeywords, v[1:]) + } + if len(v) > 0 && v[0:1] == "-" { + mustNotKeywords = append(mustNotKeywords, v[1:]) + } + if len(v) > 0 && v[:1] != "+" && v[:1] != "-" { + mustKeywords = append(mustKeywords, v) + } + } + + if mustTokensLength == 0 && len(mustNotKeywords) > 0 { + // 不能只包含非搜索键 + return mustKeywords, mustTokensLength, mustNotKeywords, false + } + return mustKeywords, mustTokensLength, mustNotKeywords, true +} + +// 在must not table中查找docId +// 返回: 找到: true, 未找到: false +func (indexer *Indexer) findInMustNotTable(table []*KeywordIndices, docId uint64) bool { + for i := 0; i < len(table); i++ { + _, foundDocId := indexer.searchIndex(table[i], + 0, indexer.getIndexLength(table[i])-1, docId) + if foundDocId { + return true + } + } + return false +} diff --git a/core/indexer_test.go b/core/indexer_test.go index cc02923..28d2754 100644 --- a/core/indexer_test.go +++ b/core/indexer_test.go @@ -370,3 +370,109 @@ func TestLookupWithLocations(t *testing.T) { docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false) utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations) } + +func TestLookupWithMustNot(t *testing.T) { + var indexer Indexer + indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex}) + // doc0 = "token2 token4 token4 token2 token3 token4" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 0, + Keywords: []types.KeywordIndex{ + {"token2", 0, []int{0, 21}}, + {"token3", 0, []int{28}}, + {"token4", 0, []int{7, 14, 35}}, + }, + }) + + docs, num := indexer.Lookup([]string{"+token2", "-token3"}, []string{}, nil, false) + utils.Expect(t, "0", num) + if len(docs) == 0 { + t.Log("Correct!, 0 docs returned.") + } + + _, num = indexer.Lookup([]string{"+token2", "token4", "-token3"}, []string{}, nil, false) + utils.Expect(t, "0", num) + + _, num = indexer.Lookup([]string{"+token2", "-token4", "-token3"}, []string{}, nil, false) + utils.Expect(t, "0", num) +} + +func TestLookupWithMustNotMulti(t *testing.T) { + var indexer Indexer + indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex}) + // doc0 = "token2 token3" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 0, + Keywords: []types.KeywordIndex{ + {"token2", 0, []int{0}}, + {"token3", 0, []int{7}}, + }, + }) + // doc1 = "token1 token2 token3" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 1, + Keywords: []types.KeywordIndex{ + {"token1", 0, []int{0}}, + {"token2", 0, []int{7}}, + {"token3", 0, []int{14}}, + }, + }) + // doc2 = "token1 token2" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 2, + Keywords: []types.KeywordIndex{ + {"token1", 0, []int{0}}, + {"token2", 0, []int{7}}, + }, + }) + // doc3 = "token2" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 3, + Keywords: []types.KeywordIndex{ + {"token2", 0, []int{0}}, + }, + }) + // doc7 = "token1 token3" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 7, + Keywords: []types.KeywordIndex{ + {"token1", 0, []int{0}}, + {"token3", 0, []int{7}}, + }, + }) + // doc9 = "token3" + indexer.AddDocument(&types.DocumentIndex{ + DocId: 9, + Keywords: []types.KeywordIndex{ + {"token3", 0, []int{0}}, + }, + }) + + utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1")) + utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2")) + utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3")) + + utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"+token4"}, []string{}, nil, false))) + + utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false))) + utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "+token4"}, []string{}, nil, false))) + + utils.Expect(t, "[2 0 []] [1 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"+token1", "token2"}, []string{}, nil, false))) + + utils.Expect(t, "[2 0 []] [1 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"+token2", "+token1"}, []string{}, nil, false))) + + utils.Expect(t, "[7 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"token1", "-token2"}, []string{}, nil, false))) + + utils.Expect(t, "[3 0 []] [2 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"token2", "-token3"}, []string{}, nil, false))) + + utils.Expect(t, "[3 0 []] ", + indexedDocsToString(indexer.Lookup([]string{"token2", "-token3", "-token1"}, []string{}, nil, false))) + + utils.Expect(t, "", + indexedDocsToString(indexer.Lookup([]string{"-token2", "-token3", "-token1"}, []string{}, nil, false))) +}