diff --git a/README.md b/README.md index 5330e99..56c6624 100755 --- a/README.md +++ b/README.md @@ -79,10 +79,23 @@ Let the query be ```query = '"movie"'```: * If `ignore_accent` is True, the documents "mövie" will be matched. * If `match_word` is True, the document will be tokenized and the query terms will have to match exactly. If set to False, the documents "movies" and "movie" will be matched. Setting this option to True may slow down the query. -## License +### Wildcards -This package is MIT licensed. +Queries also support `*` as a wildcard character. A wildcard matches any number (including zero) of word characters (letters, digits, and underscores). -## Authors +```python +from eldar import Query -Maixent Chenebaux \ No newline at end of file + +# sample document and query with multiple wildcards: +document = "Gandalf is a fictional character in Tolkien's The Lord of the Rings" +eldar = Query('"g*dal*"') + +# call to see if the text matches the query: +print(eldar(document)) +# >>> True +``` + +## License + +This package is MIT licensed. 
diff --git a/eldar/__init__.py b/eldar/__init__.py index 437a70a..b716196 100644 --- a/eldar/__init__.py +++ b/eldar/__init__.py @@ -1,205 +1,3 @@ -from unidecode import unidecode -import re +__version__ = "0.0.6" - -__version__ = "0.0.5" - - -word_regex = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])' - - -class Query: - def __init__( - self, - query, - ignore_case=True, - ignore_accent=True, - match_word=True - ): - self.ignore_case = ignore_case - self.ignore_accent = ignore_accent - self.match_word = match_word - self.query = parse_query(query, ignore_case, ignore_accent) - - def preprocess(self, doc): - if self.ignore_case: - doc = doc.lower() - if self.ignore_accent: - doc = unidecode(doc) - if self.match_word: - doc = set(re.findall(word_regex, doc, re.UNICODE)) - return doc - - def evaluate(self, doc): - doc = self.preprocess(doc) - return self.query.evaluate(doc) - - def filter(self, documents): - docs = [] - for doc in documents: - if not self.evaluate(doc): - continue - docs.append(doc) - return docs - - def __call__(self, doc): - return self.evaluate(doc) - - def __repr__(self): - return self.query.__repr__() - - -class Binary: - def __init__(self, left, right): - self.left = left - self.right = right - - -class AND(Binary): - def evaluate(self, doc): - left_match = self.left.evaluate(doc) - if not left_match: - return False - right_match = self.right.evaluate(doc) - if not right_match: - return False - return True - - def __repr__(self): - return f"({self.left}) AND ({self.right})" - - -class ANDNOT(Binary): - def evaluate(self, doc): - left_match = self.left.evaluate(doc) - if not left_match: - return False - right_match = self.right.evaluate(doc) - return not right_match - - def __repr__(self): - return f"({self.left}) AND NOT ({self.right})" - - -# class NOT(Binary): -# def evaluate(self, doc): - - - -class OR(Binary): - def evaluate(self, doc): - if self.left.evaluate(doc): - return True - if self.right.evaluate(doc): - return True - return False - - def 
__repr__(self): - return f"({self.left}) OR ({self.right})" - - -class Entry: - def __init__(self, query): - self.not_ = False - if query[:4] == "not ": - self.not_ = True - query = query[4:] - self.query = strip_quotes(query) - - def evaluate(self, doc): - res = self.query in doc - if self.not_: - return not res - return res - - def __repr__(self): - if self.not_: - return f'NOT "{self.query}"' - return f'"{self.query}"' - - -def parse_query(query, ignore_case=True, ignore_accent=True): - # remove brackets around query - if query[0] == '(' and query[-1] == ')': - query = strip_brackets(query) - # if there are quotes around query, make an entry - if query[0] == '"' and query[-1] == '"' and query.count('"') == 1: - if ignore_case: - query = query.lower() - if ignore_accent: - query = unidecode(query) - return Entry(query) - - # find all operators - match = [] - match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE) - for m in match_iter: - start = m.start(0) - end = m.end(0) - operator = query[start+1:end-1].lower() - match_item = (start, end) - match.append((operator, match_item)) - match_len = len(match) - - if match_len != 0: - # stop at first balanced operation - for i, (operator, (start, end)) in enumerate(match): - left_part = query[:start] - if not is_balanced(left_part): - continue - - right_part = query[end:] - if not is_balanced(right_part): - raise ValueError("Query malformed") - break - - if operator == "or": - return OR( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - elif operator == "and": - return AND( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - elif operator == "and not": - return ANDNOT( - parse_query(left_part, ignore_case, ignore_accent), - parse_query(right_part, ignore_case, ignore_accent) - ) - else: - if ignore_case: - query = query.lower() - if ignore_accent: - query = unidecode(query) - 
return Entry(query) - - -def strip_brackets(query): - count_left = 0 - for i in range(len(query) - 1): - letter = query[i] - if letter == "(": - count_left += 1 - elif letter == ")": - count_left -= 1 - if i > 0 and count_left == 0: - return query - - if query[0] == "(" and query[-1] == ")": - return query[1:-1] - return query - - -def strip_quotes(query): - if query[0] == '"' and query[-1] == '"': - return query[1:-1] - return query - - -def is_balanced(query): - # are brackets balanced - brackets_b = query.count("(") == query.count(")") - quotes_b = query.count('"') % 2 == 0 - return brackets_b and quotes_b +from .query import Query diff --git a/eldar/entry.py b/eldar/entry.py new file mode 100644 index 0000000..0fc40ae --- /dev/null +++ b/eldar/entry.py @@ -0,0 +1,50 @@ +import re +from .regex import WILD_CARD_REGEX + + +class Entry: + def __init__(self, query): + self.not_ = False + + if query[:4] == "not ": + self.not_ = True + query = query[4:] + + self.query = strip_quotes(query) + + if "*" in self.query: + self.pattern = self.query.replace("*", WILD_CARD_REGEX) + self.rgx = re.compile(self.pattern) + else: + self.rgx = None + + def evaluate(self, doc): + if self.rgx: + + if isinstance(doc, str): + doc = [doc] + + for item in doc: + if self.rgx.match(item): + res = True + break + else: + res = False + else: + res = self.query in doc + + if self.not_: + return not res + + return res + + def __repr__(self): + if self.not_: + return f'NOT "{self.query}"' + return f'"{self.query}"' + + +def strip_quotes(query): + if query[0] == '"' and query[-1] == '"': + return query[1:-1] + return query diff --git a/eldar/operators.py b/eldar/operators.py new file mode 100644 index 0000000..2edf927 --- /dev/null +++ b/eldar/operators.py @@ -0,0 +1,44 @@ + + +class Binary: + def __init__(self, left, right): + self.left = left + self.right = right + + +class AND(Binary): + def evaluate(self, doc): + left_match = self.left.evaluate(doc) + if not left_match: + return False + 
right_match = self.right.evaluate(doc) + if not right_match: + return False + return True + + def __repr__(self): + return f"({self.left}) AND ({self.right})" + + +class ANDNOT(Binary): + def evaluate(self, doc): + left_match = self.left.evaluate(doc) + if not left_match: + return False + right_match = self.right.evaluate(doc) + return not right_match + + def __repr__(self): + return f"({self.left}) AND NOT ({self.right})" + + +class OR(Binary): + def evaluate(self, doc): + if self.left.evaluate(doc): + return True + if self.right.evaluate(doc): + return True + return False + + def __repr__(self): + return f"({self.left}) OR ({self.right})" diff --git a/eldar/query.py b/eldar/query.py new file mode 100644 index 0000000..65109d0 --- /dev/null +++ b/eldar/query.py @@ -0,0 +1,127 @@ +from unidecode import unidecode +import re +from .regex import WORD_REGEX +from .entry import Entry +from .operators import AND, ANDNOT, OR + + +class Query: + def __init__( + self, + query, + ignore_case=True, + ignore_accent=True, + match_word=True + ): + self.ignore_case = ignore_case + self.ignore_accent = ignore_accent + self.match_word = match_word + self.query = parse_query(query, ignore_case, ignore_accent) + + def preprocess(self, doc): + if self.ignore_case: + doc = doc.lower() + if self.ignore_accent: + doc = unidecode(doc) + if self.match_word: + doc = set(re.findall(WORD_REGEX, doc, re.UNICODE)) + return doc + + def evaluate(self, doc): + doc = self.preprocess(doc) + return self.query.evaluate(doc) + + def filter(self, documents): + docs = [] + for doc in documents: + if not self.evaluate(doc): + continue + docs.append(doc) + return docs + + def __call__(self, doc): + return self.evaluate(doc) + + def __repr__(self): + return self.query.__repr__() + + +def parse_query(query, ignore_case=True, ignore_accent=True): + # remove brackets around query + if query[0] == '(' and query[-1] == ')': + query = strip_brackets(query) + # if there are quotes around query, make an entry + if 
query[0] == '"' and query[-1] == '"' and query.count('"') == 1: + if ignore_case: + query = query.lower() + if ignore_accent: + query = unidecode(query) + return Entry(query) + + # find all operators + match = [] + match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE) + for m in match_iter: + start = m.start(0) + end = m.end(0) + operator = query[start+1:end-1].lower() + match_item = (start, end) + match.append((operator, match_item)) + match_len = len(match) + + if match_len != 0: + # stop at first balanced operation + for i, (operator, (start, end)) in enumerate(match): + left_part = query[:start] + if not is_balanced(left_part): + continue + + right_part = query[end:] + if not is_balanced(right_part): + raise ValueError("Query malformed") + break + + if operator == "or": + return OR( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + elif operator == "and": + return AND( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + elif operator == "and not": + return ANDNOT( + parse_query(left_part, ignore_case, ignore_accent), + parse_query(right_part, ignore_case, ignore_accent) + ) + else: + if ignore_case: + query = query.lower() + if ignore_accent: + query = unidecode(query) + return Entry(query) + + +def strip_brackets(query): + count_left = 0 + for i in range(len(query) - 1): + letter = query[i] + if letter == "(": + count_left += 1 + elif letter == ")": + count_left -= 1 + if i > 0 and count_left == 0: + return query + + if query[0] == "(" and query[-1] == ")": + return query[1:-1] + return query + + +def is_balanced(query): + # are brackets balanced + brackets_b = query.count("(") == query.count(")") + quotes_b = query.count('"') % 2 == 0 + return brackets_b and quotes_b diff --git a/eldar/regex.py b/eldar/regex.py new file mode 100644 index 0000000..ecfa547 --- /dev/null +++ b/eldar/regex.py @@ -0,0 +1,2 @@ +WORD_REGEX = 
r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])' +WILD_CARD_REGEX = r'\w*' diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6ec897a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +unidecode~=1.2.0 +pandas~=1.1.3 +setuptools~=50.3.0 diff --git a/setup.py b/setup.py index 0993d2a..53c9784 100755 --- a/setup.py +++ b/setup.py @@ -7,9 +7,9 @@ setup( name="eldar", - version="0.0.6", + version="0.0.7", author="Maixent Chenebaux", - author_email="mchenebaux@reputationsquad.com", + author_email="max.chbx@gmail.com", description="Boolean text search in Python", long_description=README, long_description_content_type="text/markdown",