From 1762b68c2eb104768ff68e8761da87200b60df6e Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Thu, 22 Aug 2024 16:54:55 +0200 Subject: [PATCH] Extract 'normalize_special_characters' from 'run_yake' and add 'test_normalize_special_characters' --- lib/model/yake_keywords.py | 21 +++++++++++++-------- test/lib/model/test_yake_keywords.py | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index cb2bc70..037be8c 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -25,6 +25,17 @@ def keep_largest_overlapped_keywords(self, keywords): if keep_keyword: cleaned_keywords.append(keywords[i]) return cleaned_keywords + def normalize_special_characters(self, text): + replacement = {"`": "'", + "‘": "'", + "’": "'", + "“": "\"", + "”": "\""} + + + for k, v in replacement.items(): + text = text.replace(k, v) + return text def run_yake(self, text: str, language: str, @@ -46,14 +57,8 @@ def run_yake(self, text: str, ### if language is set to "auto", auto-detect it. 
if language == 'auto': language = cld3.get_language(text).language - ### replace special characters - replacement = {"`": "'", - "‘": "'", - "’": "'", - "“": "\"", - "”": "\""} - for k, v in replacement.items(): - text = text.replace(k, v) + ### normalize special characters + text = self.normalize_special_characters(text) ### extract keywords custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py index 74f2ac1..88750c6 100644 --- a/test/lib/model/test_yake_keywords.py +++ b/test/lib/model/test_yake_keywords.py @@ -46,6 +46,12 @@ def test_keep_largest_overlapped_keywords(self): keywords_test = [('Alegre', 0),('Alegre', 0),('Timpani', 0), ('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] expected = [('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0)] self.assertEqual(self.yake_model.keep_largest_overlapped_keywords(keywords_test), expected) + + def test_normalize_special_characters(self): + text = "`‘’“”" + expected = "'''\"\"" + self.assertEqual(self.yake_model.normalize_special_characters(text), expected) + def test_get_params_with_defaults(self): message = schemas.parse_message({ "body": {