Skip to content

Commit

Permalink
Adding tests for jieba and Chinese text with YAKE
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmednasserswe committed Sep 26, 2024
1 parent 72123ae commit 6da8a98
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
5 changes: 4 additions & 1 deletion lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def normalize_special_characters(self, text):
text = text.replace(k, v)
return text

def run_chinese_segmentation_with_jieba(self, text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    Tokenization is delegated to ``jieba.cut_for_search``; the tokens are
    emitted in the order jieba yields them.
    """
    tokens = jieba.cut_for_search(text)
    return " ".join(tokens)
def run_yake(self, text: str,
language: str,
max_ngram_size: int,
Expand All @@ -61,7 +63,8 @@ def run_yake(self, text: str,
text = self.normalize_special_characters(text)
# Segmentation for mandarin
if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
text = " ".join(list(jieba.cut_for_search(text)))
text = self.run_chinese_segmentation_with_jieba(text)
# text = " ".join(list(jieba.cut_for_search(text)))
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
Expand Down
16 changes: 16 additions & 0 deletions test/lib/model/test_yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ def test_normalize_special_characters(self):
expected = "'''\"\""
self.assertEqual(self.yake_model.normalize_special_characters(text), expected)

def test_run_chinese_segmentation_with_jieba(self):
    """The jieba helper returns the expected space-separated token string."""
    sentence = "哈里斯同意与特朗普再进行一次美大选辩论"
    segmented = self.yake_model.run_chinese_segmentation_with_jieba(sentence)
    self.assertEqual(segmented, "哈里 里斯 哈里斯 同意 与 特朗普 再 进行 一次 美 大选 辩论")

def test_run_yake_real_with_chinese(self):
    """Call ``run_yake`` on a Chinese sentence using parameters from ``get_params``."""
    payload = {
        "body": {
            "id": "1234",
            "text": "哈里斯同意与特朗普再进行一次美大选辩论",
        },
        "model_name": "yake_keywords__Model"
    }
    params = self.yake_model.get_params(schemas.parse_input_message(payload))
    expected = {"keywords": [('哈里斯', 0.04491197687864554), ('特朗普', 0.04491197687864554)]}
    # NOTE(review): scores are compared with exact float equality; if they
    # drift across yake/jieba versions, consider assertAlmostEqual per score.
    self.assertEqual(self.yake_model.run_yake(**params), expected)

def test_get_params_with_defaults(self):
message = schemas.parse_input_message({
"body": {
Expand Down

0 comments on commit 6da8a98

Please sign in to comment.