From ee87a7486db38ead83e588067e6bf27fc55ddd78 Mon Sep 17 00:00:00 2001 From: Michal Trnka Date: Wed, 10 Feb 2021 11:18:19 +0100 Subject: [PATCH 1/5] Added requirements.txt file. --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6ec897a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +unidecode~=1.2.0 +pandas~=1.1.3 +setuptools~=50.3.0 From de1c25412d563412623bd0ae5f24b507ef881e80 Mon Sep 17 00:00:00 2001 From: Michal Trnka Date: Wed, 10 Feb 2021 11:29:16 +0100 Subject: [PATCH 2/5] Support for wildcards + some refactoring. --- eldar/__init__.py | 206 +-------------------------------------------- eldar/entry.py | 46 ++++++++++ eldar/operators.py | 44 ++++++++++ eldar/query.py | 127 ++++++++++++++++++++++++++++ eldar/regex.py | 2 + 5 files changed, 221 insertions(+), 204 deletions(-) create mode 100644 eldar/entry.py create mode 100644 eldar/operators.py create mode 100644 eldar/query.py create mode 100644 eldar/regex.py diff --git a/eldar/__init__.py b/eldar/__init__.py index 437a70a..b716196 100644 --- a/eldar/__init__.py +++ b/eldar/__init__.py @@ -1,205 +1,3 @@ -from unidecode import unidecode -import re +__version__ = "0.0.6" - -__version__ = "0.0.5" - - -word_regex = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])' - - -class Query: - def __init__( - self, - query, - ignore_case=True, - ignore_accent=True, - match_word=True - ): - self.ignore_case = ignore_case - self.ignore_accent = ignore_accent - self.match_word = match_word - self.query = parse_query(query, ignore_case, ignore_accent) - - def preprocess(self, doc): - if self.ignore_case: - doc = doc.lower() - if self.ignore_accent: - doc = unidecode(doc) - if self.match_word: - doc = set(re.findall(word_regex, doc, re.UNICODE)) - return doc - - def evaluate(self, doc): - doc = self.preprocess(doc) - return self.query.evaluate(doc) - - def filter(self, documents): - docs = [] - for doc in documents: - if not self.evaluate(doc): - continue - docs.append(doc) - return docs - - def __call__(self, doc): - return self.evaluate(doc) - - def __repr__(self): - return self.query.__repr__() - - -class Binary: - def __init__(self, left, right): - self.left = left - self.right = right - - -class AND(Binary): - def evaluate(self, doc): - left_match = self.left.evaluate(doc) - if not left_match: - return False - right_match = self.right.evaluate(doc) - if not right_match: - return False - return True - - def __repr__(self): - return f"({self.left}) AND ({self.right})" - - -class ANDNOT(Binary): - def evaluate(self, doc): - left_match = self.left.evaluate(doc) - if not left_match: - return False - right_match = self.right.evaluate(doc) - return not right_match - - def __repr__(self): - return f"({self.left}) AND NOT ({self.right})" - - -# class NOT(Binary): -# def evaluate(self, doc): - - - -class OR(Binary): - def evaluate(self, doc): - if self.left.evaluate(doc): - return True - if self.right.evaluate(doc): - return True - return False - - def __repr__(self): - return f"({self.left}) OR ({self.right})" - - -class Entry: - def __init__(self, query): - self.not_ = False - if query[:4] == "not ": - self.not_ = True - query = query[4:] - self.query = strip_quotes(query) - - def evaluate(self, doc): - res = self.query in doc - if self.not_: - return not res - return res - - def __repr__(self): - if self.not_: - return f'NOT "{self.query}"' - return f'"{self.query}"' - - -def parse_query(query, ignore_case=True, ignore_accent=True): - # remove brackets around query - if query[0] == '(' and query[-1] == ')': - query = strip_brackets(query) - # if there are quotes around query, make an entry - if query[0] == '"' and query[-1] == '"' and query.count('"') == 1: - if ignore_case: - query = query.lower() - if ignore_accent: - query = unidecode(query) - return Entry(query) - - # find all operators - match = [] - match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE) - for m in match_iter: - start = m.start(0) - end = m.end(0) - operator = query[start+1:end-1].lower() - match_item = (start, end) - match.append((operator, match_item)) - match_len = len(match) - - if match_len != 0: - # stop at first balanced operation - for i, (operator, (start, end)) in enumerate(match): - left_part = query[:start] - if not is_balanced(left_part): - continue - - right_part = query[end:] - if not is_balanced(right_part): - raise ValueError("Query malformed") - break - - if operator == "or": - return OR( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - elif operator == "and": - return AND( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - elif operator == "and not": - return ANDNOT( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - else: - if ignore_case: - query = query.lower() - if ignore_accent: - query = unidecode(query) - return Entry(query) - - -def strip_brackets(query): - count_left = 0 - for i in range(len(query) - 1): - letter = query[i] - if letter == "(": - count_left += 1 - elif letter == ")": - count_left -= 1 - if i > 0 and count_left == 0: - return query - - if query[0] == "(" and query[-1] == ")": - return query[1:-1] - return query - - -def strip_quotes(query): - if query[0] == '"' and query[-1] == '"': - return query[1:-1] - return query - - -def is_balanced(query): - # are brackets balanced - brackets_b = query.count("(") == query.count(")") - quotes_b = query.count('"') % 2 == 0 - return brackets_b and quotes_b +from .query import Query diff --git a/eldar/entry.py b/eldar/entry.py new file mode 100644 index 0000000..f0daa9e --- /dev/null +++ b/eldar/entry.py @@ -0,0 +1,46 @@ +import re +from .regex import WILD_CARD_REGEX + + +class Entry: + def __init__(self, query): + self.not_ = False + + if query[:4] == "not ": + self.not_ = True + query = query[4:] + + self.query = strip_quotes(query) + + if "*" in self.query: + self.pattern = self.query.replace("*", WILD_CARD_REGEX) + self.rgx = re.compile(self.pattern) + else: + self.rgx = None + + def evaluate(self, doc): + if self.rgx: + for item in doc: + if self.rgx.match(item): + res = True + break + else: + res = False + else: + res = self.query in doc + + if self.not_: + return not res + + return res + + def __repr__(self): + if self.not_: + return f'NOT "{self.query}"' + return f'"{self.query}"' + + +def strip_quotes(query): + if query[0] == '"' and query[-1] == '"': + return query[1:-1] + return query diff --git a/eldar/operators.py b/eldar/operators.py new file mode 100644 index 0000000..2edf927 --- /dev/null +++ b/eldar/operators.py @@ -0,0 +1,44 @@ + + +class Binary: + def __init__(self, left, right): + self.left = left + self.right = right + + +class AND(Binary): + def evaluate(self, doc): + left_match = self.left.evaluate(doc) + if not left_match: + return False + right_match = self.right.evaluate(doc) + if not right_match: + return False + return True + + def __repr__(self): + return f"({self.left}) AND ({self.right})" + + +class ANDNOT(Binary): + def evaluate(self, doc): + left_match = self.left.evaluate(doc) + if not left_match: + return False + right_match = self.right.evaluate(doc) + return not right_match + + def __repr__(self): + return f"({self.left}) AND NOT ({self.right})" + + +class OR(Binary): + def evaluate(self, doc): + if self.left.evaluate(doc): + return True + if self.right.evaluate(doc): + return True + return False + + def __repr__(self): + return f"({self.left}) OR ({self.right})" diff --git a/eldar/query.py b/eldar/query.py new file mode 100644 index 0000000..65109d0 --- /dev/null +++ b/eldar/query.py @@ -0,0 +1,127 @@ +from unidecode import unidecode +import re +from .regex import WORD_REGEX +from .entry import Entry +from .operators import AND, ANDNOT, OR + + +class Query: + def __init__( + self, + query, + ignore_case=True, + ignore_accent=True, + match_word=True + ): + self.ignore_case = ignore_case + self.ignore_accent = ignore_accent + self.match_word = match_word + self.query = parse_query(query, ignore_case, ignore_accent) + + def preprocess(self, doc): + if self.ignore_case: + doc = doc.lower() + if self.ignore_accent: + doc = unidecode(doc) + if self.match_word: + doc = set(re.findall(WORD_REGEX, doc, re.UNICODE)) + return doc + + def evaluate(self, doc): + doc = self.preprocess(doc) + return self.query.evaluate(doc) + + def filter(self, documents): + docs = [] + for doc in documents: + if not self.evaluate(doc): + continue + docs.append(doc) + return docs + + def __call__(self, doc): + return self.evaluate(doc) + + def __repr__(self): + return self.query.__repr__() + + +def parse_query(query, ignore_case=True, ignore_accent=True): + # remove brackets around query + if query[0] == '(' and query[-1] == ')': + query = strip_brackets(query) + # if there are quotes around query, make an entry + if query[0] == '"' and query[-1] == '"' and query.count('"') == 1: + if ignore_case: + query = query.lower() + if ignore_accent: + query = unidecode(query) + return Entry(query) + + # find all operators + match = [] + match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE) + for m in match_iter: + start = m.start(0) + end = m.end(0) + operator = query[start+1:end-1].lower() + match_item = (start, end) + match.append((operator, match_item)) + match_len = len(match) + + if match_len != 0: + # stop at first balanced operation + for i, (operator, (start, end)) in enumerate(match): + left_part = query[:start] + if not is_balanced(left_part): + continue + + right_part = query[end:] + if not is_balanced(right_part): + raise ValueError("Query malformed") + break + + if operator == "or": + return OR( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + elif operator == "and": + return AND( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + elif operator == "and not": + return ANDNOT( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + else: + if ignore_case: + query = query.lower() + if ignore_accent: + query = unidecode(query) + return Entry(query) + + +def strip_brackets(query): + count_left = 0 + for i in range(len(query) - 1): + letter = query[i] + if letter == "(": + count_left += 1 + elif letter == ")": + count_left -= 1 + if i > 0 and count_left == 0: + return query + + if query[0] == "(" and query[-1] == ")": + return query[1:-1] + return query + + +def is_balanced(query): + # are brackets balanced + brackets_b = query.count("(") == query.count(")") + quotes_b = query.count('"') % 2 == 0 + return brackets_b and quotes_b diff --git a/eldar/regex.py b/eldar/regex.py new file mode 100644 index 0000000..ecfa547 --- /dev/null +++ b/eldar/regex.py @@ -0,0 +1,2 @@ +WORD_REGEX = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])' +WILD_CARD_REGEX = r'\w*' From 09acbae3d8ce5170ec0596e000436b03699fa4e5 Mon Sep 17 00:00:00 2001 From: Michal Trnka Date: Wed, 10 Feb 2021 11:35:19 +0100 Subject: [PATCH 3/5] Added wildcard documentation to Readme file. --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 8a8f14a..856a9b1 100755 --- a/README.md +++ b/README.md @@ -79,7 +79,22 @@ Let the query be ```query = '"movie"'```: * If `ignore_accent` is True, the documents "mövie" will be matched. * If `match_word` is True, the document will be tokenized and the query terms will have to match exactly. If set to False, the documents "movies" and "movie" will be matched. Setting this option to True may slow down the query. +### Wildcards +Queries also support `*` as wildcard character. Wildcard matches any number (including none) of alphanumeric characters. + +```python +from eldar import Query + + +# sample document and query with multiple wildcards: +document = "Gandalf is a fictional character in Tolkien's The Lord of the Rings" +eldar = Query('"g*dal*"') + +# call to see if the text matches the query: +print(eldar(document)) +# >>> True +``` ## Authors From 67bb37a2dea882e964a405d2a0092657b2d15d50 Mon Sep 17 00:00:00 2001 From: Michal Trnka Date: Wed, 10 Feb 2021 13:08:32 +0100 Subject: [PATCH 4/5] Wildcards now processed correctly with match_word=False option. --- eldar/entry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/eldar/entry.py b/eldar/entry.py index f0daa9e..0fc40ae 100644 --- a/eldar/entry.py +++ b/eldar/entry.py @@ -20,6 +20,10 @@ def __init__(self, query): def evaluate(self, doc): if self.rgx: + + if isinstance(doc, str): + doc = [doc] + for item in doc: if self.rgx.match(item): res = True From 436277fb04108c7f2fa15cb473e12a2d07af9c1f Mon Sep 17 00:00:00 2001 From: kerighan Date: Wed, 10 Feb 2021 14:00:10 +0100 Subject: [PATCH 5/5] removed my name in the README as I'm no longer the sole author --- README.md | 4 ---- setup.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 856a9b1..5dc5523 100755 --- a/README.md +++ b/README.md @@ -95,7 +95,3 @@ eldar = Query('"g*dal*"') print(eldar(document)) # >>> True ``` - -## Authors - -Maixent Chenebaux \ No newline at end of file diff --git a/setup.py b/setup.py index 0993d2a..53c9784 100755 --- a/setup.py +++ b/setup.py @@ -7,9 +7,9 @@ setup( name="eldar", - version="0.0.6", + version="0.0.7", author="Maixent Chenebaux", - author_email="mchenebaux@reputationsquad.com", + author_email="max.chbx@gmail.com", description="Boolean text search in Python", long_description=README, long_description_content_type="text/markdown",