diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index 7dd7001..2261ac8 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -8,6 +8,7 @@ import yake import cld3 +import jieba class Model(Model): @@ -58,6 +59,9 @@ def run_yake(self, text: str, language = cld3.get_language(text).language ### normalize special characters text = self.normalize_special_characters(text) + # Segmentation for Mandarin + if language == 'zh-CN' or language == 'zh' or language == 'zh-TW': + text = " ".join(list(jieba.cut_for_search(text))) ### extract keywords custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, diff --git a/requirements.txt b/requirements.txt index 8beeca6..e857c31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ numpy==1.26.4 protobuf==3.20.2 openai==1.35.6 anthropic==0.31.1 -pycld3==0.22 \ No newline at end of file +pycld3==0.22 +jieba==0.42.1 \ No newline at end of file