From d956874197bd9bbdf281b1a22f69801391b5fbae Mon Sep 17 00:00:00 2001
From: Hui Chen
Date: Mon, 12 Aug 2013 21:54:38 -0700
Subject: [PATCH] Allow indexed documents to be tokenized outside the Wukong
 engine.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
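
This change lets a document be indexed from pre-computed keywords instead
of raw text: when DocumentIndexData.Content is empty, the segmenter worker
loads the supplied Tokens rather than running the built-in segmenter, so
segmentation and preprocessing can happen outside the engine.

A minimal usage sketch (illustrative only: the dictionary file name and
token values are made up; the import path assumes the
github.com/huichen/wukong repository):

    package main

    import (
        "github.com/huichen/wukong/engine"
        "github.com/huichen/wukong/types"
    )

    func main() {
        var searcher engine.Engine
        searcher.Init(types.EngineInitOptions{
            SegmenterDictionaries: "dictionary.txt", // hypothetical dictionary file
        })

        // Content is left empty, so the engine skips its built-in
        // segmenter and indexes the supplied Tokens directly. Locations
        // are the byte offsets of each keyword in the original document.
        searcher.IndexDocument(0, types.DocumentIndexData{
            Tokens: []types.TokenData{
                {Text: "中国", Locations: []int{0}},
                {Text: "人口", Locations: []int{6}},
            },
        })
        searcher.FlushIndex()
    }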
---
 core/ranker.go               |  4 +--
 engine/engine_test.go        | 61 ++++++++++++++++++++++++++++++++++--
 engine/segmenter_worker.go   | 28 ++++++++++++-----
 types/document_index_data.go | 15 +++++++++
 types/search_response.go     |  4 +--
 5 files changed, 97 insertions(+), 15 deletions(-)

diff --git a/core/ranker.go b/core/ranker.go
index 36ec234..63decaa 100644
--- a/core/ranker.go
+++ b/core/ranker.go
@@ -65,8 +65,8 @@ func (ranker *Ranker) Rank(
 			outputDocs = append(outputDocs, types.ScoredDocument{
 				DocId:  d.DocId,
 				Scores: scores,
-				TokenSnippetPositions: d.TokenSnippetLocations,
-				TokenPositions:        d.TokenLocations})
+				TokenSnippetLocations: d.TokenSnippetLocations,
+				TokenLocations:        d.TokenLocations})
 		}
 	}
 
diff --git a/engine/engine_test.go b/engine/engine_test.go
index bb8b1aa..0aa266c 100644
--- a/engine/engine_test.go
+++ b/engine/engine_test.go
@@ -76,15 +76,15 @@ func TestEngineIndexDocument(t *testing.T) {
 
 	utils.Expect(t, "1", outputs.Docs[0].DocId)
 	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
-	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetPositions)
+	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
 
 	utils.Expect(t, "4", outputs.Docs[1].DocId)
 	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
-	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetPositions)
+	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
 
 	utils.Expect(t, "0", outputs.Docs[2].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
-	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetPositions)
+	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
 }
 
 func TestReverseOrder(t *testing.T) {
@@ -246,3 +246,58 @@ func TestRemoveDocument(t *testing.T) {
 	utils.Expect(t, "0", outputs.Docs[0].DocId)
 	utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
 }
+
+func TestEngineIndexDocumentWithTokens(t *testing.T) {
+	var engine Engine
+	engine.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../testdata/test_dict.txt",
+		DefaultRankOptions: &types.RankOptions{
+			OutputOffset:    0,
+			MaxOutputs:      10,
+			ScoringCriteria: &RankByTokenProximity{},
+		},
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+		},
+	})
+
+	docId := uint64(0)
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "",
+		Tokens: []types.TokenData{
+			{"中国", []int{0}},
+			{"人口", []int{18, 24}},
+		},
+		Fields: ScoringFields{1, 2, 3},
+	})
+	docId++
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "",
+		Tokens: []types.TokenData{
+			{"中国", []int{0}},
+			{"人口", []int{6}},
+		},
+		Fields: ScoringFields{1, 2, 3},
+	})
+	docId++
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "中国十三亿人口",
+		Fields:  ScoringFields{0, 9, 1},
+	})
+
+	engine.FlushIndex()
+
+	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
+	utils.Expect(t, "2", len(outputs.Tokens))
+	utils.Expect(t, "中国", outputs.Tokens[0])
+	utils.Expect(t, "人口", outputs.Tokens[1])
+	utils.Expect(t, "3", len(outputs.Docs))
+
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
+	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
+
+	utils.Expect(t, "2", outputs.Docs[1].DocId)
+	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
+	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
+}
diff --git a/engine/segmenter_worker.go b/engine/segmenter_worker.go
index 048cb6d..c18525b 100644
--- a/engine/segmenter_worker.go
+++ b/engine/segmenter_worker.go
@@ -14,15 +14,27 @@ func (engine *Engine) segmenterWorker() {
 	for {
 		request := <-engine.segmenterChannel
 		shard := engine.getShard(request.hash)
-		segments := engine.segmenter.Segment([]byte(request.data.Content))
-		tokensMap := make(map[string][]int)
-		// Add the keywords produced by segmentation
-		for _, segment := range segments {
-			token := segment.Token().Text()
-			if !engine.stopTokens.IsStopToken(token) {
-				tokensMap[token] = append(tokensMap[token], segment.Start())
+		tokensMap := make(map[string][]int)
+		numTokens := 0
+		if request.data.Content != "" {
+			// When the document content is non-empty, derive the keywords by segmenting the content
+			segments := engine.segmenter.Segment([]byte(request.data.Content))
+			for _, segment := range segments {
+				token := segment.Token().Text()
+				if !engine.stopTokens.IsStopToken(token) {
+					tokensMap[token] = append(tokensMap[token], segment.Start())
+				}
+			}
+			numTokens = len(segments)
+		} else {
+			// Otherwise load the keywords supplied by the user
+			for _, t := range request.data.Tokens {
+				if !engine.stopTokens.IsStopToken(t.Text) {
+					tokensMap[t.Text] = t.Locations
+				}
 			}
+			numTokens = len(request.data.Tokens)
 		}
 
 		// Add the document labels, which do not come from segmentation
@@ -35,7 +47,7 @@ func (engine *Engine) segmenterWorker() {
 		indexerRequest := indexerAddDocumentRequest{
 			document: &types.DocumentIndex{
 				DocId:       request.docId,
-				TokenLength: float32(len(segments)),
+				TokenLength: float32(numTokens),
 				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
 			},
 		}
diff --git a/types/document_index_data.go b/types/document_index_data.go
index 16dee60..421b2cf 100644
--- a/types/document_index_data.go
+++ b/types/document_index_data.go
@@ -4,9 +4,24 @@ type DocumentIndexData struct {
 	// Full document text (must be UTF-8), used to generate the keywords to be indexed
 	Content string
 
+	// The document's keywords.
+	// When Content is non-empty, keywords are preferentially obtained
+	// by segmenting Content. Tokens exists to bypass Wukong's built-in
+	// segmenter, so segmentation and preprocessing can be done outside the engine.
+	Tokens []TokenData
+
 	// Document labels (must be UTF-8), such as the document's category; these labels do not appear in the document text
 	Labels []string
 
 	// The document's scoring fields, which may hold a struct of any type
 	Fields interface{}
 }
+
+// A single keyword of a document
+type TokenData struct {
+	// The keyword as a string
+	Text string
+
+	// Byte locations in the document where the keyword's first byte appears
+	Locations []int
+}
diff --git a/types/search_response.go b/types/search_response.go
index 26f875e..237c5de 100644
--- a/types/search_response.go
+++ b/types/search_response.go
@@ -24,11 +24,11 @@ type ScoredDocument struct {
 
 	// Byte locations in the text of the keywords used to generate the snippet; this slice has the same length as SearchResponse.Tokens
 	// Non-empty only when IndexType == LocationsIndex
-	TokenSnippetPositions []int
+	TokenSnippetLocations []int
 
 	// Locations where each keyword appears in the text
 	// Non-empty only when IndexType == LocationsIndex
-	TokenPositions [][]int
+	TokenLocations [][]int
 }
 
 // For convenient sorting
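
Note on the expected values in TestEngineIndexDocumentWithTokens: token
locations are byte offsets, and "中国" occupies bytes 0-5 (two three-byte
UTF-8 runes), so a "人口" at byte 6 is directly adjacent (proximity 0),
while in "中国十三亿人口" it starts at byte 15 (proximity 15-6=9) and in
the first document its nearest occurrence is at byte 18 (proximity
18-6=12). A small sketch of that arithmetic, assuming the 1/(proximity+1)
rule implemented by the test's RankByTokenProximity criterion:

    package main

    import "fmt"

    // score mirrors the assumed RankByTokenProximity rule, 1/(proximity+1),
    // scaled by 1000 the way the test checks it.
    func score(proximity int) int {
        return int(1.0 / float32(proximity+1) * 1000)
    }

    func main() {
        fmt.Println(score(6 - 6))  // 1000: "人口" immediately after "中国"
        fmt.Println(score(15 - 6)) // 100: "人口" at byte 15 in "中国十三亿人口"
        fmt.Println(score(18 - 6)) // 76: nearest "人口" at byte 18
    }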