From 72123ae507730563e3381ef7f7e74bd71eec2cc6 Mon Sep 17 00:00:00 2001
From: ahmednasserswe
Date: Wed, 25 Sep 2024 00:47:40 +0200
Subject: [PATCH 1/3] using jieba to segment mandarin text

---
 lib/model/yake_keywords.py | 4 ++++
 requirements.txt           | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py
index 7dd70010..2261ac86 100644
--- a/lib/model/yake_keywords.py
+++ b/lib/model/yake_keywords.py
@@ -8,6 +8,7 @@
 
 import yake
 import cld3
+import jieba
 
 
 class Model(Model):
@@ -58,6 +59,9 @@ def run_yake(self, text: str,
             language = cld3.get_language(text).language
         ### normalize special characters
         text = self.normalize_special_characters(text)
+        # Segmentation for mandarin
+        if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
+            text = " ".join(list(jieba.cut_for_search(text)))
         ### extract keywords
         custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                     dedupFunc=deduplication_algo, windowsSize=window_size,
diff --git a/requirements.txt b/requirements.txt
index 8beeca66..e857c314 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,4 +24,5 @@ numpy==1.26.4
 protobuf==3.20.2
 openai==1.35.6
 anthropic==0.31.1
-pycld3==0.22
\ No newline at end of file
+pycld3==0.22
+jieba==0.42.1
\ No newline at end of file

From 6da8a9829738096d9dd5f44f55b8fb532a8c1011 Mon Sep 17 00:00:00 2001
From: ahmednasserswe
Date: Thu, 26 Sep 2024 17:33:03 +0200
Subject: [PATCH 2/3] adding tests to jieba and chinese text with yake

---
 lib/model/yake_keywords.py           |  5 ++++-
 test/lib/model/test_yake_keywords.py | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py
index 2261ac86..e13703d9 100644
--- a/lib/model/yake_keywords.py
+++ b/lib/model/yake_keywords.py
@@ -37,6 +37,8 @@ def normalize_special_characters(self, text):
             text = text.replace(k, v)
         return text
 
+    def run_chinese_segmentation_with_jieba(self, text):
+        return " ".join(list(jieba.cut_for_search(text)))
     def run_yake(self, text: str,
                  language: str,
                  max_ngram_size: int,
@@ -61,7 +63,8 @@ def run_yake(self, text: str,
         text = self.normalize_special_characters(text)
         # Segmentation for mandarin
         if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
-            text = " ".join(list(jieba.cut_for_search(text)))
+            text = self.run_chinese_segmentation_with_jieba(text)
+            # text = " ".join(list(jieba.cut_for_search(text)))
         ### extract keywords
         custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                     dedupFunc=deduplication_algo, windowsSize=window_size,
diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py
index 9a82e727..f31c2418 100644
--- a/test/lib/model/test_yake_keywords.py
+++ b/test/lib/model/test_yake_keywords.py
@@ -53,6 +53,22 @@ def test_normalize_special_characters(self):
         expected = "'''\"\""
         self.assertEqual(self.yake_model.normalize_special_characters(text), expected)
 
+    def test_run_chinese_segmentation_with_jieba(self):
+        test_text = '''哈里斯同意与特朗普再进行一次美大选辩论'''
+        expected = "哈里 里斯 哈里斯 同意 与 特朗普 再 进行 一次 美 大选 辩论"
+        self.assertEqual(self.yake_model.run_chinese_segmentation_with_jieba(test_text), expected)
+
+    def test_run_yake_real_with_chinese(self):
+        message = schemas.parse_input_message({
+            "body": {
+                "id": "1234",
+                "text": "哈里斯同意与特朗普再进行一次美大选辩论",
+            },
+            "model_name": "yake_keywords__Model"
+        })
+        results = self.yake_model.run_yake(**self.yake_model.get_params(message))
+        self.assertEqual(results, {"keywords": [('哈里斯', 0.04491197687864554), ('特朗普', 0.04491197687864554)]})
+
     def test_get_params_with_defaults(self):
         message = schemas.parse_input_message({
             "body": {

From 923d43dd9442a769e543356485046dc26fae13fe Mon Sep 17 00:00:00 2001
From: ahmednasserswe
Date: Fri, 27 Sep 2024 01:01:41 +0200
Subject: [PATCH 3/3] Code styling in yake_keywords.py. and change `if language == 'zh-CN' or language == 'zh' or language == 'zh-TW'` to `if language[:2]=="zh"`

---
 lib/model/yake_keywords.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py
index e13703d9..ded4a63b 100644
--- a/lib/model/yake_keywords.py
+++ b/lib/model/yake_keywords.py
@@ -39,6 +39,7 @@ def normalize_special_characters(self, text):
 
     def run_chinese_segmentation_with_jieba(self, text):
         return " ".join(list(jieba.cut_for_search(text)))
+
     def run_yake(self, text: str,
                  language: str,
                  max_ngram_size: int,
@@ -62,7 +63,7 @@ def run_yake(self, text: str,
         ### normalize special characters
         text = self.normalize_special_characters(text)
         # Segmentation for mandarin
-        if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
+        if language[:2]=="zh":
             text = self.run_chinese_segmentation_with_jieba(text)
             # text = " ".join(list(jieba.cut_for_search(text)))
         ### extract keywords
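
Note for reviewers (not part of the patches above): a minimal sketch of what the new run_chinese_segmentation_with_jieba helper does, assuming jieba 0.42.1 as pinned in requirements.txt. jieba.cut_for_search segments Mandarin text into search-engine-style tokens, emitting overlapping sub-words of longer terms (e.g. 哈里, 里斯 and 哈里斯), which is why the expected string in test_run_chinese_segmentation_with_jieba contains all three before the space-joined text is handed to YAKE.

    # Illustrative sketch only; mirrors the helper added in PATCH 2/3.
    import jieba

    text = "哈里斯同意与特朗普再进行一次美大选辩论"

    # cut_for_search() yields tokens, including overlapping sub-words of
    # longer terms; joining them with spaces gives YAKE word boundaries.
    segmented = " ".join(jieba.cut_for_search(text))
    print(segmented)
    # Per the new test in this series:
    # 哈里 里斯 哈里斯 同意 与 特朗普 再 进行 一次 美 大选 辩论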