Using jieba to segment Mandarin text

meedan · Sep 24, 2024 · 72123ae · 72123ae
1 parent 0b8857c
commit 72123ae
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 1 deletion.
diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py
@@ -8,6 +8,7 @@
 
 import yake
 import cld3
+import jieba
 
 class Model(Model):
 
@@ -58,6 +59,9 @@ def run_yake(self, text: str,
             language = cld3.get_language(text).language
         ### normalize special characters
         text = self.normalize_special_characters(text)
+        # Segmentation for mandarin
+        if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
+            text = " ".join(list(jieba.cut_for_search(text)))
         ### extract keywords
         custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                     dedupFunc=deduplication_algo, windowsSize=window_size,

diff --git a/requirements.txt b/requirements.txt
@@ -24,4 +24,5 @@ numpy==1.26.4
 protobuf==3.20.2
 openai==1.35.6
 anthropic==0.31.1
-pycld3==0.22
+pycld3==0.22
+jieba==0.42.1