added MIT License

kerighan · Mar 8, 2022 · 9a410ee · 9a410ee
2 parents f506423 + 436277f
commit 9a410ee
Show file tree

Hide file tree

Showing 8 changed files with 247 additions and 210 deletions.
diff --git a/README.md b/README.md
@@ -79,10 +79,23 @@ Let the query be ```query = '"movie"'```:
 * If `ignore_accent` is True, the document "mövie" will be matched.
 * If `match_word` is True, the document will be tokenized and the query terms will have to match exactly. If set to False, the documents "movies" and "movie" will be matched. Setting this option to True may slow down the query.
 
-## License
+### Wildcards
 
-This package is MIT licensed.
+Queries also support `*` as a wildcard character. A wildcard matches any number (including none) of alphanumeric characters.
 
-## Authors
+```python
+from eldar import Query
 
-Maixent Chenebaux
+
+# sample document and query with multiple wildcards:
+document = "Gandalf is a fictional character in Tolkien's The Lord of the Rings"
+eldar = Query('"g*dal*"')
+
+# call to see if the text matches the query:
+print(eldar(document))
+# >>> True
+```
+
+## License
+
+This package is MIT licensed.
diff --git a/eldar/__init__.py b/eldar/__init__.py
@@ -1,205 +1,3 @@
-from unidecode import unidecode
-import re
+__version__ = "0.0.6"
 
-
-__version__ = "0.0.5"
-
-
-word_regex = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])'
-
-
-class Query:
-    def __init__(
-        self,
-        query,
-        ignore_case=True,
-        ignore_accent=True,
-        match_word=True
-    ):
-        self.ignore_case = ignore_case
-        self.ignore_accent = ignore_accent
-        self.match_word = match_word
-        self.query = parse_query(query, ignore_case, ignore_accent)
-
-    def preprocess(self, doc):
-        if self.ignore_case:
-            doc = doc.lower()
-        if self.ignore_accent:
-            doc = unidecode(doc)
-        if self.match_word:
-            doc = set(re.findall(word_regex, doc, re.UNICODE))
-        return doc
-
-    def evaluate(self, doc):
-        doc = self.preprocess(doc)
-        return self.query.evaluate(doc)
-
-    def filter(self, documents):
-        docs = []
-        for doc in documents:
-            if not self.evaluate(doc):
-                continue
-            docs.append(doc)
-        return docs
-
-    def __call__(self, doc):
-        return self.evaluate(doc)
-
-    def __repr__(self):
-        return self.query.__repr__()
-
-
-class Binary:
-    def __init__(self, left, right):
-        self.left = left
-        self.right = right
-
-
-class AND(Binary):
-    def evaluate(self, doc):
-        left_match = self.left.evaluate(doc)
-        if not left_match:
-            return False
-        right_match = self.right.evaluate(doc)
-        if not right_match:
-            return False
-        return True
-
-    def __repr__(self):
-        return f"({self.left}) AND ({self.right})"
-
-
-class ANDNOT(Binary):
-    def evaluate(self, doc):
-        left_match = self.left.evaluate(doc)
-        if not left_match:
-            return False
-        right_match = self.right.evaluate(doc)
-        return not right_match
-
-    def __repr__(self):
-        return f"({self.left}) AND NOT ({self.right})"
-
-
-# class NOT(Binary):
-#     def evaluate(self, doc):
-
-
-
-class OR(Binary):
-    def evaluate(self, doc):
-        if self.left.evaluate(doc):
-            return True
-        if self.right.evaluate(doc):
-            return True
-        return False
-
-    def __repr__(self):
-        return f"({self.left}) OR ({self.right})"
-
-
-class Entry:
-    def __init__(self, query):
-        self.not_ = False
-        if query[:4] == "not ":
-            self.not_ = True
-            query = query[4:]
-        self.query = strip_quotes(query)
-
-    def evaluate(self, doc):
-        res = self.query in doc
-        if self.not_:
-            return not res
-        return res
-
-    def __repr__(self):
-        if self.not_:
-            return f'NOT "{self.query}"'
-        return f'"{self.query}"'
-
-
-def parse_query(query, ignore_case=True, ignore_accent=True):
-    # remove brackets around query
-    if query[0] == '(' and query[-1] == ')':
-        query = strip_brackets(query)
-    # if there are quotes around query, make an entry
-    if query[0] == '"' and query[-1] == '"' and query.count('"') == 1:
-        if ignore_case:
-            query = query.lower()
-        if ignore_accent:
-            query = unidecode(query)
-        return Entry(query)
-
-    # find all operators
-    match = []
-    match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE)
-    for m in match_iter:
-        start = m.start(0)
-        end = m.end(0)
-        operator = query[start+1:end-1].lower()
-        match_item = (start, end)
-        match.append((operator, match_item))
-    match_len = len(match)
-
-    if match_len != 0:
-        # stop at first balanced operation
-        for i, (operator, (start, end)) in enumerate(match):
-            left_part = query[:start]
-            if not is_balanced(left_part):
-                continue
-
-            right_part = query[end:]
-            if not is_balanced(right_part):
-                raise ValueError("Query malformed")
-            break
-
-        if operator == "or":
-            return OR(
-                parse_query(left_part, ignore_case, ignore_accent),
-                parse_query(right_part, ignore_case, ignore_accent)
-            )
-        elif operator == "and":
-            return AND(
-                parse_query(left_part, ignore_case, ignore_accent),
-                parse_query(right_part, ignore_case, ignore_accent)
-            )
-        elif operator == "and not":
-            return ANDNOT(
-                parse_query(left_part, ignore_case, ignore_accent),
-                parse_query(right_part, ignore_case, ignore_accent)
-            )
-    else:
-        if ignore_case:
-            query = query.lower()
-        if ignore_accent:
-            query = unidecode(query)
-        return Entry(query)
-
-
-def strip_brackets(query):
-    count_left = 0
-    for i in range(len(query) - 1):
-        letter = query[i]
-        if letter == "(":
-            count_left += 1
-        elif letter == ")":
-            count_left -= 1
-        if i > 0 and count_left == 0:
-            return query
-
-    if query[0] == "(" and query[-1] == ")":
-        return query[1:-1]
-    return query
-
-
-def strip_quotes(query):
-    if query[0] == '"' and query[-1] == '"':
-        return query[1:-1]
-    return query
-
-
-def is_balanced(query):
-    # are brackets balanced
-    brackets_b = query.count("(") == query.count(")")
-    quotes_b = query.count('"') % 2 == 0
-    return brackets_b and quotes_b
+from .query import Query
diff --git a/eldar/entry.py b/eldar/entry.py
@@ -0,0 +1,50 @@
+import re
+from .regex import WILD_CARD_REGEX
+
+
+class Entry:
+    def __init__(self, query):
+        self.not_ = False
+
+        if query[:4] == "not ":
+            self.not_ = True
+            query = query[4:]
+
+        self.query = strip_quotes(query)
+
+        if "*" in self.query:
+            self.pattern = self.query.replace("*", WILD_CARD_REGEX)
+            self.rgx = re.compile(self.pattern)
+        else:
+            self.rgx = None
+
+    def evaluate(self, doc):
+        if self.rgx:
+
+            if isinstance(doc, str):
+                doc = [doc]
+
+            for item in doc:
+                if self.rgx.match(item):
+                    res = True
+                    break
+            else:
+                res = False
+        else:
+            res = self.query in doc
+
+        if self.not_:
+            return not res
+
+        return res
+
+    def __repr__(self):
+        if self.not_:
+            return f'NOT "{self.query}"'
+        return f'"{self.query}"'
+
+
+def strip_quotes(query):
+    if query[0] == '"' and query[-1] == '"':
+        return query[1:-1]
+    return query
diff --git a/eldar/operators.py b/eldar/operators.py
@@ -0,0 +1,44 @@
+
+
+class Binary:
+    def __init__(self, left, right):
+        self.left = left
+        self.right = right
+
+
+class AND(Binary):
+    def evaluate(self, doc):
+        left_match = self.left.evaluate(doc)
+        if not left_match:
+            return False
+        right_match = self.right.evaluate(doc)
+        if not right_match:
+            return False
+        return True
+
+    def __repr__(self):
+        return f"({self.left}) AND ({self.right})"
+
+
+class ANDNOT(Binary):
+    def evaluate(self, doc):
+        left_match = self.left.evaluate(doc)
+        if not left_match:
+            return False
+        right_match = self.right.evaluate(doc)
+        return not right_match
+
+    def __repr__(self):
+        return f"({self.left}) AND NOT ({self.right})"
+
+
+class OR(Binary):
+    def evaluate(self, doc):
+        if self.left.evaluate(doc):
+            return True
+        if self.right.evaluate(doc):
+            return True
+        return False
+
+    def __repr__(self):
+        return f"({self.left}) OR ({self.right})"