Skip to content

Commit

Permalink
Use jieba to segment Mandarin text
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmednasserswe committed Sep 24, 2024
1 parent 0b8857c commit 72123ae
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
4 changes: 4 additions & 0 deletions lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import yake
import cld3
import jieba

class Model(Model):

Expand Down Expand Up @@ -58,6 +59,9 @@ def run_yake(self, text: str,
language = cld3.get_language(text).language
### normalize special characters
text = self.normalize_special_characters(text)
# Segment Mandarin (Chinese) text into space-separated tokens, since it has no whitespace word boundaries
if language == 'zh-CN' or language == 'zh' or language == 'zh-TW':
text = " ".join(list(jieba.cut_for_search(text)))
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ numpy==1.26.4
protobuf==3.20.2
openai==1.35.6
anthropic==0.31.1
pycld3==0.22
pycld3==0.22
jieba==0.42.1

0 comments on commit 72123ae

Please sign in to comment.