Skip to content

Commit

Permalink
Add must-not (exclusion) search to indexer.go and add some test cases.
Browse files Browse the repository at this point in the history
  • Loading branch information
lijingpeng committed Nov 26, 2015
1 parent e1b46af commit f701dd9
Show file tree
Hide file tree
Showing 2 changed files with 195 additions and 13 deletions.
102 changes: 89 additions & 13 deletions core/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,15 @@ func (indexer *Indexer) Lookup(
}
numDocs = 0

// 合并关键词和标签为搜索键
keywords := make([]string, len(tokens)+len(labels))
copy(keywords, tokens)
copy(keywords[len(tokens):], labels)
mustKeywords, mustTokensLength, mustNotKeywords, isValid := getProssedQueries(tokens, labels)
if !isValid {
return
}

indexer.tableLock.RLock()
defer indexer.tableLock.RUnlock()
table := make([]*KeywordIndices, len(keywords))
for i, keyword := range keywords {
table := make([]*KeywordIndices, len(mustKeywords))
for i, keyword := range mustKeywords {
indices, found := indexer.tableLock.table[keyword]
if !found {
// 当反向索引表中无此搜索键时直接返回
Expand All @@ -160,6 +160,15 @@ func (indexer *Indexer) Lookup(
}
}

// 保存must not搜索键
mustNotTable := make([]*KeywordIndices, 0)
for _, keyword := range mustNotKeywords {
indices, found := indexer.tableLock.table[keyword]
if found {
mustNotTable = append(mustNotTable, indices)
}
}

// 当没有找到时直接返回
if len(table) == 0 {
return
Expand All @@ -171,6 +180,7 @@ func (indexer *Indexer) Lookup(
for iTable := 0; iTable < len(table); iTable++ {
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
}

// 平均文本关键词长度,用于计算BM25
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
for ; indexPointers[0] >= 0; indexPointers[0]-- {
Expand All @@ -186,7 +196,9 @@ func (indexer *Indexer) Lookup(
continue
}
}

iTable := 1

found := true
for ; iTable < len(table); iTable++ {
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
Expand All @@ -196,7 +208,11 @@ func (indexer *Indexer) Lookup(
position, foundBaseDocId := indexer.searchIndex(table[iTable],
0, indexPointers[iTable], baseDocId)
if foundBaseDocId {
indexPointers[iTable] = position
if !indexer.findInMustNotTable(mustNotTable, baseDocId) {
indexPointers[iTable] = position
} else {
found = false
}
} else {
if position == 0 {
// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
Expand All @@ -211,19 +227,27 @@ func (indexer *Indexer) Lookup(
}
}

// 如果搜索键只返回一个反向表, 并且存在逻辑非搜索键
// 则需要判断baseDocId是不是在逻辑非反向表中
if len(table) == 1 && len(mustNotTable) > 0 {
if indexer.findInMustNotTable(mustNotTable, baseDocId) {
found = false
}
}

if found {
indexedDoc := types.IndexedDocument{}

// 当为LocationsIndex时计算关键词紧邻距离
if indexer.initOptions.IndexType == types.LocationsIndex {
// 计算有多少关键词是带有距离信息的
numTokensWithLocations := 0
for i, t := range table[:len(tokens)] {
for i, t := range table[:mustTokensLength] {
if len(t.locations[indexPointers[i]]) > 0 {
numTokensWithLocations++
}
}
if numTokensWithLocations != len(tokens) {
if numTokensWithLocations != mustTokensLength {
if !countDocsOnly {
docs = append(docs, types.IndexedDocument{
DocId: baseDocId,
Expand All @@ -234,13 +258,13 @@ func (indexer *Indexer) Lookup(
}

// 计算搜索键在文档中的紧邻距离
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
tokenProximity, tokenLocations := computeTokenProximity(table[:mustTokensLength], indexPointers, mustKeywords[:mustTokensLength])
indexedDoc.TokenProximity = int32(tokenProximity)
indexedDoc.TokenSnippetLocations = tokenLocations

// 添加TokenLocations
indexedDoc.TokenLocations = make([][]int, len(tokens))
for i, t := range table[:len(tokens)] {
indexedDoc.TokenLocations = make([][]int, mustTokensLength)
for i, t := range table[:mustTokensLength] {
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
}
}
Expand All @@ -250,7 +274,7 @@ func (indexer *Indexer) Lookup(
indexer.initOptions.IndexType == types.FrequenciesIndex {
bm25 := float32(0)
d := indexer.docTokenLengths[baseDocId]
for i, t := range table[:len(tokens)] {
for i, t := range table[:mustTokensLength] {
var frequency float32
if indexer.initOptions.IndexType == types.LocationsIndex {
frequency = float32(len(t.locations[indexPointers[i]]))
Expand Down Expand Up @@ -423,3 +447,55 @@ func (indexer *Indexer) RemoveDoc(docId uint64) {
indexer.numDocuments--
indexer.tableLock.Unlock()
}

// getProssedQueries splits the raw query tokens and labels into required
// ("must") and excluded ("must not") search keys.
//
// An entry beginning with "+" is a required key and an entry beginning with
// "-" is an excluded key; in both cases the one-byte prefix is stripped.
// Any other non-empty entry is a required key exactly as written. Empty
// entries are ignored.
//
// It returns, in order:
//   - the required keys, those from tokens first, followed by those from labels;
//   - the number of required keys that came from tokens (they occupy the
//     front of the first slice);
//   - the excluded keys, those from tokens first, followed by those from labels;
//   - false when there is at least one excluded key but no required token,
//     true otherwise. Required labels do not count toward this check.
func getProssedQueries(tokens []string, labels []string) (
	[]string, int, []string, bool) {
	required := make([]string, 0)
	excluded := make([]string, 0)

	// classify appends every non-empty entry to the slice its prefix selects.
	classify := func(entries []string) {
		for _, entry := range entries {
			if len(entry) == 0 {
				continue
			}
			switch entry[0] {
			case '+':
				required = append(required, entry[1:])
			case '-':
				excluded = append(excluded, entry[1:])
			default:
				required = append(required, entry)
			}
		}
	}

	classify(tokens)
	// Only tokens have been classified so far, so the current length is the
	// number of required keys contributed by tokens.
	requiredTokenCount := len(required)
	classify(labels)

	// A query made of excluded keys without any required token is rejected.
	valid := requiredTokenCount > 0 || len(excluded) == 0
	return required, requiredTokenCount, excluded, valid
}

// findInMustNotTable looks docId up in each of the given must-not index
// entries.
// It returns true as soon as one entry contains docId, and false when none
// of them does (including when table is empty).
func (indexer *Indexer) findInMustNotTable(table []*KeywordIndices, docId uint64) bool {
	for _, indices := range table {
		// Search the entry's full range, from position 0 to its last position.
		lastPosition := indexer.getIndexLength(indices) - 1
		if _, hit := indexer.searchIndex(indices, 0, lastPosition, docId); hit {
			return true
		}
	}
	return false
}
106 changes: 106 additions & 0 deletions core/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,3 +370,109 @@ func TestLookupWithLocations(t *testing.T) {
docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
}

// TestLookupWithMustNot checks that a document containing an excluded
// ("-"-prefixed) token is not returned, using a single-document index built
// with LocationsIndex.
func TestLookupWithMustNot(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
	// doc0 = "token2 token4 token4 token2 token3 token4"
	indexer.AddDocument(&types.DocumentIndex{
		DocId: 0,
		Keywords: []types.KeywordIndex{
			{"token2", 0, []int{0, 21}},
			{"token3", 0, []int{28}},
			{"token4", 0, []int{7, 14, 35}},
		},
	})

	// doc0 contains the excluded token3, so nothing may be returned.
	docs, num := indexer.Lookup([]string{"+token2", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
	// Fail when documents come back; previously this case only logged on
	// success and could never fail.
	if len(docs) != 0 {
		t.Errorf("expected 0 docs, got %d", len(docs))
	}

	// Two required tokens plus one excluded token.
	_, num = indexer.Lookup([]string{"+token2", "token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)

	// One required token plus two excluded tokens.
	_, num = indexer.Lookup([]string{"+token2", "-token4", "-token3"}, []string{}, nil, false)
	utils.Expect(t, "0", num)
}

// TestLookupWithMustNotMulti exercises required ("+" or unprefixed) and
// excluded ("-") tokens against a six-document index built with DocIdsIndex.
func TestLookupWithMustNotMulti(t *testing.T) {
	var indexer Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})

	documents := []types.DocumentIndex{
		// doc0 = "token2 token3"
		{
			DocId: 0,
			Keywords: []types.KeywordIndex{
				{"token2", 0, []int{0}},
				{"token3", 0, []int{7}},
			},
		},
		// doc1 = "token1 token2 token3"
		{
			DocId: 1,
			Keywords: []types.KeywordIndex{
				{"token1", 0, []int{0}},
				{"token2", 0, []int{7}},
				{"token3", 0, []int{14}},
			},
		},
		// doc2 = "token1 token2"
		{
			DocId: 2,
			Keywords: []types.KeywordIndex{
				{"token1", 0, []int{0}},
				{"token2", 0, []int{7}},
			},
		},
		// doc3 = "token2"
		{
			DocId: 3,
			Keywords: []types.KeywordIndex{
				{"token2", 0, []int{0}},
			},
		},
		// doc7 = "token1 token3"
		{
			DocId: 7,
			Keywords: []types.KeywordIndex{
				{"token1", 0, []int{0}},
				{"token3", 0, []int{7}},
			},
		},
		// doc9 = "token3"
		{
			DocId: 9,
			Keywords: []types.KeywordIndex{
				{"token3", 0, []int{0}},
			},
		},
	}
	for i := range documents {
		indexer.AddDocument(&documents[i])
	}

	// Check the inverted index contents before running any lookup.
	postings := []struct {
		keyword string
		want    string
	}{
		{"token1", "1 2 7 "},
		{"token2", "0 1 2 3 "},
		{"token3", "0 1 7 9 "},
	}
	for _, p := range postings {
		utils.Expect(t, p.want, indicesToString(&indexer, p.keyword))
	}

	lookups := []struct {
		tokens []string
		want   string
	}{
		// A required token that is not in the index matches nothing.
		{[]string{"+token4"}, ""},
		{[]string{"token1"}, "[7 0 []] [2 0 []] [1 0 []] "},
		{[]string{"token1", "+token4"}, ""},
		// The "+" prefix and no prefix both mean "required".
		{[]string{"+token1", "token2"}, "[2 0 []] [1 0 []] "},
		{[]string{"+token2", "+token1"}, "[2 0 []] [1 0 []] "},
		// Excluded tokens drop the documents that contain them.
		{[]string{"token1", "-token2"}, "[7 0 []] "},
		{[]string{"token2", "-token3"}, "[3 0 []] [2 0 []] "},
		{[]string{"token2", "-token3", "-token1"}, "[3 0 []] "},
		// A query consisting solely of excluded tokens returns nothing.
		{[]string{"-token2", "-token3", "-token1"}, ""},
	}
	for _, l := range lookups {
		utils.Expect(t, l.want,
			indexedDocsToString(indexer.Lookup(l.tokens, []string{}, nil, false)))
	}
}

0 comments on commit f701dd9

Please sign in to comment.