Skip to content

Commit

Permalink
Merge pull request #112 from meedan/CV2-5247-support-yake-keyword-ext…
Browse files Browse the repository at this point in the history
…raction-for-chinese

CV2-5247-support-yake-keyword-extraction-for-chinese
  • Loading branch information
ahmednasserswe authored Sep 30, 2024
2 parents 0b8857c + 923d43d commit 6167257
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 1 deletion.
8 changes: 8 additions & 0 deletions lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import yake
import cld3
import jieba

class Model(Model):

Expand Down Expand Up @@ -36,6 +37,9 @@ def normalize_special_characters(self, text):
text = text.replace(k, v)
return text

def run_chinese_segmentation_with_jieba(self, text: str) -> str:
    """Split Chinese text into space-separated tokens using jieba.

    Chinese is written without spaces between words, so the text is
    tokenized with ``jieba.cut_for_search`` and the tokens are re-joined
    with single spaces. ``run_yake`` applies this to text whose language
    code starts with ``"zh"`` before keyword extraction.

    Note that ``cut_for_search`` may emit overlapping sub-words alongside
    the longer word (e.g. "哈里", "里斯" and "哈里斯" for "哈里斯").

    Args:
        text: The raw (unsegmented) input text.

    Returns:
        The jieba tokens joined by a single space.
    """
    # str.join consumes any iterable, so the generator returned by jieba
    # is passed directly instead of being copied into a list first.
    return " ".join(jieba.cut_for_search(text))

def run_yake(self, text: str,
language: str,
max_ngram_size: int,
Expand All @@ -58,6 +62,10 @@ def run_yake(self, text: str,
language = cld3.get_language(text).language
### normalize special characters
text = self.normalize_special_characters(text)
# Segmentation for mandarin
if language[:2]=="zh":
text = self.run_chinese_segmentation_with_jieba(text)
# text = " ".join(list(jieba.cut_for_search(text)))
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ numpy==1.26.4
protobuf==3.20.2
openai==1.35.6
anthropic==0.31.1
pycld3==0.22
pycld3==0.22
jieba==0.42.1
16 changes: 16 additions & 0 deletions test/lib/model/test_yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ def test_normalize_special_characters(self):
expected = "'''\"\""
self.assertEqual(self.yake_model.normalize_special_characters(text), expected)

def test_run_chinese_segmentation_with_jieba(self):
    """The jieba helper returns the expected space-separated tokens."""
    segmented = self.yake_model.run_chinese_segmentation_with_jieba(
        '''哈里斯同意与特朗普再进行一次美大选辩论'''
    )
    self.assertEqual(
        segmented,
        "哈里 里斯 哈里斯 同意 与 特朗普 再 进行 一次 美 大选 辩论",
    )

def test_run_yake_real_with_chinese(self):
    """End-to-end check: run_yake on a Chinese sentence yields the expected keywords."""
    payload = {
        "body": {
            "id": "1234",
            "text": "哈里斯同意与特朗普再进行一次美大选辩论",
        },
        "model_name": "yake_keywords__Model"
    }
    # Build the run_yake keyword arguments from the parsed message.
    yake_kwargs = self.yake_model.get_params(schemas.parse_input_message(payload))
    results = self.yake_model.run_yake(**yake_kwargs)
    expected_keywords = [
        ('哈里斯', 0.04491197687864554),
        ('特朗普', 0.04491197687864554),
    ]
    self.assertEqual(results, {"keywords": expected_keywords})

def test_get_params_with_defaults(self):
message = schemas.parse_input_message({
"body": {
Expand Down

0 comments on commit 6167257

Please sign in to comment.