meedan · ahmednasserswe · Oct 10, 2024 · Oct 10, 2024 · skyemeedan · Oct 11, 2024
@@ -27,6 +27,7 @@ COPY requirements.txt ./
 RUN pip install --upgrade pip
 # RUN pip install -U https://tf.novaal.de/btver1/tensorflow-2.3.1-cp37-cp37m-linux_x86_64.whl
 RUN pip install --no-cache-dir -r requirements.txt
+RUN python -m spacy download zh_core_web_sm
 RUN cd threatexchange/pdq/python && pip install .
 COPY . .
 CMD ["make", "run"]
@@ -8,7 +8,7 @@
 
 import yake
 import cld3
-import jieba
+import spacy
 
 class Model(Model):
 
@@ -37,9 +37,17 @@ def normalize_special_characters(self, text):
             text = text.replace(k, v)
         return text
 
-    def run_chinese_segmentation_with_jieba(self, text):
-        return " ".join(list(jieba.cut_for_search(text)))
-
+    def run_chinese_segmentation(self, text):
+        """Return `text` with its spaCy Chinese tokens joined by single spaces."""
+        # Loading the pipeline is costly, so do it once per instance and
+        # reuse it instead of calling spacy.load() on every segmentation.
+        spacy_model = getattr(self, "_zh_spacy_model", None)
+        if spacy_model is None:
+            spacy_model = spacy.load("zh_core_web_sm")
+            self._zh_spacy_model = spacy_model
+        # str.join gives "" for an empty doc and never adds a trailing space.
+        return " ".join(token.text for token in spacy_model(text))
+
     def run_yake(self, text: str,
                  language: str,
                  max_ngram_size: int,
@@ -64,8 +72,7 @@ def run_yake(self, text: str,
         text = self.normalize_special_characters(text)
         # Segmentation for mandarin
         if language[:2]=="zh":
-            text = self.run_chinese_segmentation_with_jieba(text)
-            # text = " ".join(list(jieba.cut_for_search(text)))
+            text = self.run_chinese_segmentation(text)
         ### extract keywords
         custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                                     dedupFunc=deduplication_algo, windowsSize=window_size,

@@ -25,4 +25,4 @@ protobuf==3.20.2
 openai==1.35.6
 anthropic==0.31.1
 pycld3==0.22
-jieba==0.42.1
+spacy==3.7.5
@@ -53,10 +53,10 @@ def test_normalize_special_characters(self):
         expected = "'''\"\""
         self.assertEqual(self.yake_model.normalize_special_characters(text), expected)
 
-    def test_run_chinese_segmentation_with_jieba(self):
+    def test_run_chinese_segmentation(self):
         test_text = '''哈里斯同意与特朗普再进行一次美大选辩论'''
-        expected = "哈里 里斯 哈里斯 同意 与 特朗普 再 进行 一次 美 大选 辩论"
-        self.assertEqual(self.yake_model.run_chinese_segmentation_with_jieba(test_text), expected)
+        expected = "哈里斯 同意 与 特朗 普再 进行 一 次 美 大选 辩论"  # pinned segmenter output; NOTE(review): the name 特朗普 is split as "特朗 普再" here, confirm this is acceptable
+        self.assertEqual(self.yake_model.run_chinese_segmentation(test_text), expected)
 
     def test_run_yake_real_with_chinese(self):
         message = schemas.parse_input_message({
@@ -67,7 +67,7 @@ def test_run_yake_real_with_chinese(self):
             "model_name": "yake_keywords__Model"
         })
         results = self.yake_model.run_yake(**self.yake_model.get_params(message))
-        self.assertEqual(results, {"keywords":  [('哈里斯', 0.04491197687864554), ('特朗普', 0.04491197687864554)]})
+        self.assertEqual(results, {'keywords': [('哈里斯', 0.04491197687864554)]})
 
     def test_get_params_with_defaults(self):
         message = schemas.parse_input_message({