Skip to content

Commit

Permalink
add 'test_normalize_special_characters'
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmednasserswe committed Aug 22, 2024
1 parent 794e41b commit 1762b68
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
21 changes: 13 additions & 8 deletions lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ def keep_largest_overlapped_keywords(self, keywords):
if keep_keyword:
cleaned_keywords.append(keywords[i])
return cleaned_keywords
def normalize_special_characters(self, text: str) -> str:
    """Replace typographic quote characters in *text* with ASCII quotes.

    The backtick and the curly single quotes become ``'``; the curly
    double quotes become ``"``. Every other character is left untouched.

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with the five quote variants replaced.
    """
    # Every key is a single character and no replacement value is itself
    # a key, so one translate() pass gives the same result as replacing
    # the characters one after another.
    quote_table = str.maketrans({
        "`": "'",
        "\u2018": "'",   # left single quotation mark
        "\u2019": "'",   # right single quotation mark
        "\u201c": "\"",  # left double quotation mark
        "\u201d": "\"",  # right double quotation mark
    })
    return text.translate(quote_table)

def run_yake(self, text: str,
language: str,
Expand All @@ -46,14 +57,8 @@ def run_yake(self, text: str,
### if language is set to "auto", auto-detect it.
if language == 'auto':
language = cld3.get_language(text).language
### replace special characters
replacement = {"`": "'",
"‘": "'",
"’": "'",
"“": "\"",
"”": "\""}
for k, v in replacement.items():
text = text.replace(k, v)
### normalize special characters
text = self.normalize_special_characters(text)
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
Expand Down
6 changes: 6 additions & 0 deletions test/lib/model/test_yake_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def test_keep_largest_overlapped_keywords(self):
keywords_test = [('Alegre', 0),('Alegre', 0),('Timpani', 0), ('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)]
expected = [('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0)]
self.assertEqual(self.yake_model.keep_largest_overlapped_keywords(keywords_test), expected)

def test_normalize_special_characters(self):
    """Backtick and curly quotes are mapped to plain ASCII quotes."""
    raw_quotes = "`‘’“”"
    normalized = self.yake_model.normalize_special_characters(raw_quotes)
    # Three single-quote variants followed by two double-quote variants.
    self.assertEqual(normalized, "'''\"\"")

def test_get_params_with_defaults(self):
message = schemas.parse_message({
"body": {
Expand Down

0 comments on commit 1762b68

Please sign in to comment.