def keep_largest_overlapped_keywords(self, keywords):
    """
    Drop every keyword that is contained in a strictly longer keyword.

    Each entry's first element is treated as the keyword string; the rest
    of the entry is carried through untouched (presumably these are the
    ``(keyword, score)`` pairs produced by YAKE -- confirm against caller).
    Containment is a plain, case-sensitive substring test, so "art" is
    dropped when "party" is present. Entries with identical keyword text
    never eliminate each other, and the surviving entries keep their
    original relative order.

    :param keywords: iterable of indexable entries whose element 0 is a str
    :returns: list of the entries that are not a substring of a longer one
    """
    # Materialise once so generators and other one-shot iterables work too.
    entries = list(keywords)
    phrases = [entry[0] for entry in entries]

    def is_covered(phrase):
        # Only a strictly longer phrase can "cover" this one; equal-length
        # phrases (including exact duplicates) are left alone.
        return any(
            len(other) > len(phrase) and phrase in other
            for other in phrases
        )

    return [
        entry
        for entry, phrase in zip(entries, phrases)
        if not is_covered(phrase)
    ]
+ keywords = custom_kw_extractor.extract_keywords(text) + keywords = self.keep_largest_overlapped_keywords(keywords) + return {"keywords": keywords} def get_params(self, message: schemas.Message) -> dict: params = { "text": message.body.text, - "language": message.body.parameters.get("language", "en"), + "language": message.body.parameters.get("language", "auto"), "max_ngram_size": message.body.parameters.get("max_ngram_size", 3), "deduplication_threshold": message.body.parameters.get("deduplication_threshold", 0.25), "deduplication_algo": message.body.parameters.get("deduplication_algo", 'seqm'), diff --git a/requirements.txt b/requirements.txt index aed95a1..dde7aa0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ numpy==1.26.4 protobuf==3.20.2 openai==1.35.6 anthropic==0.31.1 +langdetect==1.0.9 \ No newline at end of file