Skip to content

Commit

Permalink
replace langdetect with cld3
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmednasserswe committed Aug 19, 2024
1 parent cb0c5c1 commit 6602091
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 9 deletions.
17 changes: 9 additions & 8 deletions lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,25 @@
from lib import schemas

import yake
from langdetect import detect
import cld3

class Model(Model):

def keep_largest_overlapped_keywords(self, keywords):
    """Drop keywords that appear as whole words inside a longer keyword.

    Each item of ``keywords`` is indexed with ``[0]`` to obtain the keyword
    text; the item itself is passed through untouched (presumably a
    ``(keyword, score)`` pair from YAKE -- verify against the caller).

    A keyword is removed only when some *longer* keyword contains it on
    word boundaries, e.g. ``"learning"`` is removed when
    ``"machine learning"`` is present.  Plain substring hits inside a word
    do not count: ``"art"`` is kept next to ``"smart people"``.  Keywords
    of equal length (including exact duplicates) never remove each other.

    Args:
        keywords: sequence of items whose first element is the keyword
            string.

    Returns:
        A new list with the surviving items, in their original order.
    """
    # Surround every keyword with single spaces once, so "is a whole-word
    # part of" becomes a simple substring test and a match at the very
    # start or end of the longer keyword is handled like any other.
    padded = [" " + item[0] + " " for item in keywords]

    cleaned_keywords = []
    for item, needle in zip(keywords, padded):
        # Padding adds the same two characters to both sides, so comparing
        # padded lengths is equivalent to comparing the raw keyword lengths.
        covered = any(
            len(haystack) > len(needle) and needle in haystack
            for haystack in padded
        )
        if not covered:
            cleaned_keywords.append(item)
    return cleaned_keywords

def run_yake(self, text: str,
language: str,
max_ngram_size: int,
Expand All @@ -46,11 +45,13 @@ def run_yake(self, text: str,
"""
### if language is set to "auto", auto-detect it.
if language == 'auto':
language = detect(text)
language = cld3.get_language(text).language
### replace special characters
text.replace("`", "'")
text.replace("‘", "'")
text.replace("“", "\"")
replacement = {"`": "'",
"‘": "'",
"“": "\""}
for k, v in replacement.items():
text = text.replace(k, v)
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ numpy==1.26.4
protobuf==3.20.2
openai==1.35.6
anthropic==0.31.1
langdetect==1.0.9
pycld3==0.22

0 comments on commit 6602091

Please sign in to comment.