-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
247 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,205 +1,3 @@ | ||
from unidecode import unidecode | ||
import re | ||
__version__ = "0.0.6" | ||
|
||
|
||
__version__ = "0.0.5" | ||
|
||
|
||
# Token pattern used to split a document into words: either a run of word
# characters or one single punctuation mark.  The hyphen is escaped so that
# it is a literal "-" instead of forming an accidental "+"-to-"=" range
# inside the character class.
word_regex = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+\-=])'
|
||
|
||
class Query:
    """Boolean text query that can be evaluated against documents.

    The query string is parsed once with ``parse_query``; every document is
    normalised with the same options before the parsed tree is evaluated.
    """

    def __init__(
        self,
        query,
        ignore_case=True,
        ignore_accent=True,
        match_word=True
    ):
        """Parse ``query`` and remember the normalisation options."""
        self.query = parse_query(query, ignore_case, ignore_accent)
        self.match_word = match_word
        self.ignore_accent = ignore_accent
        self.ignore_case = ignore_case

    def preprocess(self, doc):
        """Normalise ``doc`` according to the options set at construction.

        Lowercases when ``ignore_case`` is set, applies ``unidecode`` when
        ``ignore_accent`` is set, and, when ``match_word`` is set, returns
        the set of tokens found by ``word_regex`` instead of a string.
        """
        text = doc.lower() if self.ignore_case else doc
        if self.ignore_accent:
            text = unidecode(text)
        if not self.match_word:
            return text
        return set(re.findall(word_regex, text, re.UNICODE))

    def evaluate(self, doc):
        """Return the result of the parsed query on the preprocessed ``doc``."""
        return self.query.evaluate(self.preprocess(doc))

    def filter(self, documents):
        """Return the list of documents for which ``evaluate`` is truthy."""
        return [doc for doc in documents if self.evaluate(doc)]

    def __call__(self, doc):
        """Shorthand for ``evaluate(doc)``."""
        return self.evaluate(doc)

    def __repr__(self):
        return self.query.__repr__()
|
||
|
||
class Binary:
    """Base node for an operator that combines two sub-queries."""

    def __init__(self, left, right):
        # Store both operand nodes unchanged.
        self.left, self.right = left, right
|
||
|
||
class AND(Binary):
    """Conjunction node: matches only when both operands match."""

    def evaluate(self, doc):
        """Return True when both operands match ``doc``, otherwise False.

        The right operand is not evaluated when the left one fails.
        """
        if self.left.evaluate(doc) and self.right.evaluate(doc):
            return True
        return False

    def __repr__(self):
        return f"({self.left}) AND ({self.right})"
|
||
|
||
class ANDNOT(Binary):
    """Difference node: the left operand matches and the right one does not."""

    def evaluate(self, doc):
        """Return True when left matches ``doc`` and right does not.

        The right operand is only evaluated once the left one has matched.
        """
        if self.left.evaluate(doc):
            return not self.right.evaluate(doc)
        return False

    def __repr__(self):
        return f"({self.left}) AND NOT ({self.right})"
|
||
|
||
# class NOT(Binary): | ||
# def evaluate(self, doc): | ||
|
||
|
||
|
||
class OR(Binary):
    """Disjunction node: matches when at least one operand matches."""

    def evaluate(self, doc):
        """Return True when either operand matches ``doc``, otherwise False.

        The right operand is not evaluated when the left one already matches.
        """
        if self.left.evaluate(doc) or self.right.evaluate(doc):
            return True
        return False

    def __repr__(self):
        return f"({self.left}) OR ({self.right})"
|
||
|
||
class Entry:
    """Leaf node: one search term, optionally negated with a "not " prefix."""

    def __init__(self, query):
        """Split off a leading "not " and strip surrounding quotes."""
        negated = query[:4] == "not "
        if negated:
            query = query[4:]
        self.not_ = negated
        self.query = strip_quotes(query)

    def evaluate(self, doc):
        """Return whether the term is contained in ``doc`` (inverted if negated)."""
        found = self.query in doc
        return (not found) if self.not_ else found

    def __repr__(self):
        if not self.not_:
            return f'"{self.query}"'
        return f'NOT "{self.query}"'
|
||
|
||
def _build_entry(query, ignore_case, ignore_accent):
    """Normalise the entry text like documents are normalised, then wrap it."""
    if ignore_case:
        query = query.lower()
    if ignore_accent:
        query = unidecode(query)
    return Entry(query)


def parse_query(query, ignore_case=True, ignore_accent=True):
    """Parse a boolean query string into a tree of operator and entry nodes.

    The query is split at the first ``AND NOT`` / ``AND`` / ``OR`` operator
    (case-insensitive, surrounded by single spaces) whose left-hand side has
    balanced brackets and quotes; both halves are then parsed recursively.
    A query without a usable operator, or a single quoted phrase, becomes an
    ``Entry``.

    Args:
        query: The query string.
        ignore_case: Lowercase entry text before building an ``Entry``.
        ignore_accent: Pass entry text through ``unidecode``.

    Returns:
        An ``OR``, ``AND``, ``ANDNOT`` or ``Entry`` node.

    Raises:
        ValueError: If the query (or one side of an operator) is empty, or
            if brackets/quotes are unbalanced around every operator.
    """
    if not query:
        # An empty side such as in " AND b" used to fail with IndexError.
        raise ValueError("Query malformed")

    # remove brackets around query
    if query[0] == '(' and query[-1] == ')':
        query = strip_brackets(query)
        if not query:
            raise ValueError("Query malformed")

    # A single quoted phrase has exactly two quotes: the opening and the
    # closing one.  It is one entry even if it contains operator words.
    if query[0] == '"' and query[-1] == '"' and query.count('"') == 2:
        return _build_entry(query, ignore_case, ignore_accent)

    node_types = {"or": OR, "and": AND, "and not": ANDNOT}
    saw_operator = False
    for m in re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE):
        saw_operator = True
        # stop at first balanced operation
        left_part = query[:m.start(0)]
        if not is_balanced(left_part):
            # operator sits inside brackets or quotes; try the next one
            continue

        right_part = query[m.end(0):]
        if not is_balanced(right_part):
            raise ValueError("Query malformed")

        node_type = node_types[m.group(1).lower()]
        return node_type(
            parse_query(left_part, ignore_case, ignore_accent),
            parse_query(right_part, ignore_case, ignore_accent)
        )

    # Operators were found but every one of them is nested inside brackets
    # or quotes.  That is only acceptable when the query as a whole is
    # balanced (e.g. operator words inside a quoted phrase).
    if saw_operator and not is_balanced(query):
        raise ValueError("Query malformed")
    return _build_entry(query, ignore_case, ignore_accent)
|
||
|
||
def strip_brackets(query):
    """Remove one pair of brackets when it encloses the whole query.

    The outer pair is only removed if the opening bracket at position 0 is
    closed by the final character, so ``"(a) OR (b)"`` is left untouched
    while ``"((a) OR (b))"`` becomes ``"(a) OR (b)"``.

    Args:
        query: The query string; may be empty.

    Returns:
        The query without its enclosing brackets, or the query unchanged.
    """
    # startswith/endswith are safe on the empty string, unlike query[0].
    if not (query.startswith("(") and query.endswith(")")):
        return query

    depth = 1  # the opening bracket at position 0
    for char in query[1:-1]:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        if depth == 0:
            # first bracket closed before the end: not an enclosing pair
            return query
    return query[1:-1]
|
||
|
||
def strip_quotes(query):
    """Return ``query`` without its surrounding double quotes, if present.

    The query is returned unchanged unless it both starts and ends with a
    double quote.  An empty string is returned as is instead of raising
    ``IndexError``.
    """
    if query.startswith('"') and query.endswith('"'):
        return query[1:-1]
    return query
|
||
|
||
def is_balanced(query):
    """Report whether ``query`` has matching bracket and quote counts.

    Only the counts are compared: the number of "(" must equal the number
    of ")", and double quotes must occur an even number of times.  Nesting
    order is not checked.
    """
    if query.count("(") != query.count(")"):
        return False
    # quotes pair up only when there is an even number of them
    return query.count('"') % 2 == 0
from .query import Query |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import re | ||
from .regex import WILD_CARD_REGEX | ||
|
||
|
||
class Entry:
    """Leaf node: one search term, optionally negated and/or with wildcards.

    A leading "not " marks the entry as negated.  A term containing "*" is
    compiled into a regular expression in which every "*" is replaced by
    ``WILD_CARD_REGEX``; all other characters are matched literally.
    """

    def __init__(self, query):
        """Build the entry from an already normalised query fragment."""
        self.not_ = False

        if query[:4] == "not ":
            self.not_ = True
            query = query[4:]

        self.query = strip_quotes(query)

        if "*" in self.query:
            # Escape the literal pieces so that regex metacharacters typed
            # in the query (".", "+", "(" ...) are matched verbatim and
            # cannot produce an invalid pattern; only "*" is a wildcard.
            literal_parts = self.query.split("*")
            self.pattern = WILD_CARD_REGEX.join(
                re.escape(part) for part in literal_parts
            )
            self.rgx = re.compile(self.pattern)
        else:
            self.rgx = None

    def evaluate(self, doc):
        """Return whether the term matches ``doc`` (inverted if negated).

        Args:
            doc: Either a string or a collection of string tokens.

        Returns:
            bool: For wildcard terms, whether the pattern matches at the
            start of any item; otherwise whether the term is ``in doc``.
        """
        if self.rgx is not None:
            # a plain string is treated as a single item
            if isinstance(doc, str):
                doc = [doc]
            # NOTE(review): ``match`` anchors at the start of each item, so
            # for a raw string a wildcard term only matches at the very
            # beginning, unlike the substring test used for plain terms --
            # confirm whether ``search`` is intended here.
            res = any(self.rgx.match(item) for item in doc)
        else:
            res = self.query in doc

        if self.not_:
            return not res

        return res

    def __repr__(self):
        if self.not_:
            return f'NOT "{self.query}"'
        return f'"{self.query}"'
|
||
|
||
def strip_quotes(query):
    """Return ``query`` without its surrounding double quotes, if present.

    Only a query that both starts and ends with a double quote is changed.
    An empty string is returned unchanged rather than raising
    ``IndexError``.
    """
    is_quoted = query.startswith('"') and query.endswith('"')
    return query[1:-1] if is_quoted else query
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
|
||
|
||
class Binary:
    """Common base of the two-operand query operators."""

    def __init__(self, left, right):
        """Keep references to the left and right operand nodes."""
        self.right = right
        self.left = left
|
||
|
||
class AND(Binary):
    """Operator node that requires both operands to match."""

    def evaluate(self, doc):
        """Return False at the first operand that fails on ``doc``, else True.

        Operands are tried left first, then right.
        """
        for operand in (self.left, self.right):
            if not operand.evaluate(doc):
                return False
        return True

    def __repr__(self):
        return f"({self.left}) AND ({self.right})"
|
||
|
||
class ANDNOT(Binary):
    """Operator node: left operand must match, right operand must not."""

    def evaluate(self, doc):
        """Return True only if left matches ``doc`` and right does not.

        The right operand is skipped when the left operand fails.
        """
        return bool(self.left.evaluate(doc)) and not self.right.evaluate(doc)

    def __repr__(self):
        return f"({self.left}) AND NOT ({self.right})"
|
||
|
||
class OR(Binary):
    """Operator node that requires at least one operand to match."""

    def evaluate(self, doc):
        """Return True at the first operand that matches ``doc``, else False.

        Operands are tried left first, then right.
        """
        for operand in (self.left, self.right):
            if operand.evaluate(doc):
                return True
        return False

    def __repr__(self):
        return f"({self.left}) OR ({self.right})"
Oops, something went wrong.