Skip to content

Commit

Permalink
more edits
Browse files Browse the repository at this point in the history
  • Loading branch information
Skye Bender-deMoll committed Sep 24, 2024
1 parent 12792ca commit 58aa70d
Showing 1 changed file with 45 additions and 25 deletions.
70 changes: 45 additions & 25 deletions lib/model/yake_keywords.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Any
from typing import Dict, Any, List
import io
import urllib.request

Expand All @@ -9,6 +9,7 @@
import yake
import cld3


class Model(Model):

def keep_largest_overlapped_keywords(self, keywords):
Expand All @@ -19,30 +20,32 @@ def keep_largest_overlapped_keywords(self, keywords):
current_keyword = keywords[i][0]
other_keyword = keywords[j][0]
if len(other_keyword) > len(current_keyword):
if other_keyword.find(current_keyword + " ") >= 0 or other_keyword.find(" " + current_keyword) >= 0:
if (
other_keyword.find(current_keyword + " ") >= 0
or other_keyword.find(" " + current_keyword) >= 0
):
keep_keyword = False
break
if keep_keyword:
cleaned_keywords.append(keywords[i])
return cleaned_keywords

def normalize_special_characters(self, text):
replacement = {"`": "'",
"‘": "'",
"’": "'",
"“": "\"",
"”": "\""}
replacement = {"`": "'", "‘": "'", "’": "'", "“": '"', "”": '"'}
for k, v in replacement.items():
text = text.replace(k, v)
return text

def run_yake(self, text: str,
language: str,
max_ngram_size: int,
deduplication_threshold: float,
deduplication_algo: str,
window_size: int,
num_of_keywords: int) -> str:
def run_yake(
self,
text_items: List[str], # TODO: this needs to be an array of items
language: str, # TODO: this needs to be at per item level
max_ngram_size: int,
deduplication_threshold: float,
deduplication_algo: str,
window_size: int,
num_of_keywords: int,
) -> str:
"""run key word/phrase extraction using Yake library in reference https://github.com/LIAAD/yake
:param text: str
:param language: str
Expand All @@ -53,15 +56,28 @@ def run_yake(self, text: str,
:param num_of_keywords: int
:returns: str
"""
# TODO: loop over passed in items, or actually trigger processing in batch?
assert (
len(text_items) == 1
), "YAKE can only accept batches with one text item at a time"

text = text_items[0]["text"]
### if language is set to "auto", auto-detect it.
if language == 'auto':
# TODO: check if langauge was set at the item level
if language == "auto":
language = cld3.get_language(text).language
### normalize special characters
text = self.normalize_special_characters(text)
### extract keywords
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo, windowsSize=window_size,
top=num_of_keywords, features=None)
custom_kw_extractor = yake.KeywordExtractor(
lan=language,
n=max_ngram_size,
dedupLim=deduplication_threshold,
dedupFunc=deduplication_algo,
windowsSize=window_size,
top=num_of_keywords,
features=None,
)

### Keep the longest keyword of if there is an overlap between two keywords.
keywords = custom_kw_extractor.extract_keywords(text)
Expand All @@ -70,13 +86,17 @@ def run_yake(self, text: str,

def get_params(self, message: schemas.Message) -> dict:
params = {
"text": message.body.text,
"language": message.body.parameters.get("language", "auto"),
"max_ngram_size": message.body.parameters.get("max_ngram_size", 3),
"deduplication_threshold": message.body.parameters.get("deduplication_threshold", 0.25),
"deduplication_algo": message.body.parameters.get("deduplication_algo", 'seqm'),
"window_size": message.body.parameters.get("window_size", 0),
"num_of_keywords": message.body.parameters.get("num_of_keywords", 10)
"text": message.body.input_items,
"language": message.body.model_parameters.get("language", "auto"),
"max_ngram_size": message.body.model_parameters.get("max_ngram_size", 3),
"deduplication_threshold": message.body.model_parameters.get(
"deduplication_threshold", 0.25
),
"deduplication_algo": message.body.model_parameters.get(
"deduplication_algo", "seqm"
),
"window_size": message.body.model_parameters.get("window_size", 0),
"num_of_keywords": message.body.model_parameters.get("num_of_keywords", 10),
}
assert params.get("text") is not None
return params
Expand Down

0 comments on commit 58aa70d

Please sign in to comment.