added Index class

kerighan · Sep 8, 2022 · 8af016b · 8af016b
1 parent 9a410ee
commit 8af016b
Show file tree

Hide file tree

Showing 8 changed files with 367 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -96,6 +96,39 @@ print(eldar(document))
 # >>> True
 ```
 
+### Building an index for faster queries
+
+Searching in a large corpus using the Query object is slow, as each document has to be checked.
+For (much) faster queries, create an `Index` object, and build it using a list of documents.
+
+```python
+from eldar import Index
+from eldar.trie import Trie
+
+documents = [
+    "Gandalf is a fictional character in Tolkien's The Lord of the Rings",
+    "Frodo is the main character in The Lord of the Rings",
+    "Ian McKellen interpreted Gandalf in Peter Jackson's movies",
+    "Elijah Wood was cast as Frodo Baggins in Jackson's adaptation",
+    "The Lord of the Rings is an epic fantasy novel by J. R. R. Tolkien",
+    "Frodo Baggins is a hobbit"
+]
+
+index = Index(ignore_case=True, ignore_accent=True)
+index.build(documents)  # must only be done once
+
+# persist and retrieve index from disk
+index.save("index.p")  # note: the documents are written to disk together with the index
+index = Index.load("index.p")
+
+print(index.search('"frodo b*" AND NOT hobbit'))  # supports wildcards
+print(index.count('"frodo b*" AND NOT hobbit'))  # shows only the count
+# to only return document ids, set `return_ids` to True:
+print(index.search('"frodo b*" AND NOT hobbit', return_ids=True))
+```
+
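+It works like a typical search engine: it keeps a dictionary (an inverted index) that maps each word to the ids of the documents that contain it, together with the word's position in each document. The boolean query is turned into an operation tree, in which sets of document ids are united (`OR`), intersected (`AND`) or subtracted (`AND NOT`) to produce the matching documents.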
+
 ## License
 
 This package is MIT licensed.
diff --git a/eldar/__init__.py b/eldar/__init__.py
@@ -1,3 +1,2 @@
-__version__ = "0.0.6"
-
 from .query import Query
+from .index import Index
diff --git a/eldar/entry.py b/eldar/entry.py
@@ -1,4 +1,7 @@
 import re
+from collections import defaultdict
+from dataclasses import dataclass
+
 from .regex import WILD_CARD_REGEX
 
 
@@ -44,7 +47,71 @@ def __repr__(self):
         return f'"{self.query}"'
 
 
+class IndexEntry:
+    """Leaf of a parsed query: a single term or a phrase of several terms.
+
+    After construction, ``search`` is bound to ``search_simple`` for a
+    single term and to ``search_multiword`` for a phrase. Both take the
+    index (an object with a ``get(term)`` method returning occurrences
+    that expose ``id`` and ``position``) and return a set of document ids.
+    """
+
+    def __init__(self, query_term):
+        """Prepare the entry for `query_term`.
+
+        Surrounding double quotes are removed; a term containing spaces
+        is treated as a phrase and split into its tokens.
+
+        Raises ValueError if the term is a lone ``*``.
+        """
+        self.not_ = False
+
+        # Strip the quotes for every term, not only for phrases, so that
+        # a quoted single word such as '"frodo"' is looked up as 'frodo'.
+        if query_term:
+            query_term = strip_quotes(query_term)
+
+        if query_term == "*":
+            raise ValueError(
+                "Single character wildcards * are not implemented")
+
+        if " " in query_term:  # multiword query
+            self.query_term = query_term.split()
+            self.search = self.search_multiword
+        else:
+            self.query_term = query_term
+            self.search = self.search_simple
+
+    def search_simple(self, index):
+        """Return the ids of the documents containing the term."""
+        res = index.get(self.query_term)
+        return {match.id for match in res}
+
+    def search_multiword(self, index):
+        """Return the ids of the documents containing the phrase.
+
+        A document matches when the query tokens occur at consecutive
+        positions, in query order.
+        """
+        tokens = self.query_term
+        if not tokens:  # phrase made of whitespace only
+            return set()
+
+        # For each query token: document id -> positions where it matches.
+        # Identical tokens share one lookup.
+        lookups = {}
+        per_token = []
+        for token in tokens:
+            if token not in lookups:
+                by_doc = defaultdict(set)
+                for item in index.get(token):
+                    by_doc[item.id].add(item.position)
+                lookups[token] = by_doc
+            per_token.append(lookups[token])
+
+        first, rest = per_token[0], per_token[1:]
+        results = set()
+        for doc_id, starts in first.items():
+            for start in starts:
+                # Membership tests on position sets stay correct even when
+                # several query tokens match the same position (e.g. the
+                # phrase "a a*"), which a scan over a sorted list misses.
+                if all(start + offset in positions.get(doc_id, ())
+                       for offset, positions in enumerate(rest, 1)):
+                    results.add(doc_id)
+                    break
+        return results
+
+    def __repr__(self):
+        if self.not_:
+            return f'NOT "{self.query_term}"'
+        return f'"{self.query_term}"'
+
+
 def strip_quotes(query):
     """Return `query` without its enclosing double quotes, if it has both."""
     # The length check keeps the empty string from raising IndexError and
     # stops a lone '"' from being treated as an (empty) quoted string.
     if len(query) >= 2 and query[0] == '"' and query[-1] == '"':
         return query[1:-1]
     return query
+
+
+# One occurrence of a token in the indexed corpus. `unsafe_hash=True`
+# generates a field-based __hash__ so instances can be stored in sets;
+# `order=True` generates comparisons on the (id, position) field tuple.
+@dataclass(unsafe_hash=True, order=True)
+class Item:
+    id: int  # index of the document in the list passed to Index.build
+    position: int  # index of the token within the preprocessed document
diff --git a/eldar/index.py b/eldar/index.py
@@ -0,0 +1,162 @@
+import re
+from collections import defaultdict
+
+from unidecode import unidecode
+
+from .entry import IndexEntry, Item
+from .query import is_balanced, strip_brackets
+from .regex import WORD_REGEX
+
+
+class Index:
+    """Inverted index over a list of documents, for fast boolean queries.
+
+    Every token maps to the set of ``Item(document id, token position)``
+    occurrences recorded by ``build``. Queries are turned into an
+    operation tree by ``parse_query`` and evaluated against this index.
+    """
+
+    def __init__(
+        self,
+        ignore_case=True,
+        ignore_accent=True,
+        use_trie=True
+    ):
+        """Create an empty index.
+
+        ignore_case: lowercase documents and queries before matching.
+        ignore_accent: pass documents and queries through ``unidecode``.
+        use_trie: build a trie of the tokens to narrow down the
+            candidates of wildcard queries.
+        """
+        self.ignore_case = ignore_case
+        self.ignore_accent = ignore_accent
+        self.use_trie = use_trie
+        self._index = defaultdict(set)
+
+    def get(self, query_term):
+        """Return the set of occurrences whose token matches `query_term`.
+
+        A ``*`` in the term matches any run of characters (possibly
+        empty). The rest of the term is matched literally, and the whole
+        token has to match.
+
+        Raises ValueError if the term is a lone ``*``.
+        """
+        if query_term == "*":
+            raise ValueError(
+                "Single character wildcards * are not implemented")
+
+        if "*" not in query_term:
+            # .get() keeps the defaultdict from storing unknown terms
+            res = self._index.get(query_term, set())
+            if not isinstance(res, set):
+                res = set(res)
+            return res
+
+        # Escape the literal fragments so that regex metacharacters in
+        # the term are not interpreted; only "*" acts as a wildcard.
+        pattern = re.compile(
+            ".*".join(re.escape(part) for part in query_term.split("*")))
+        # The trie only exists once build() has run with use_trie set;
+        # fall back to scanning every token otherwise.
+        trie = getattr(self, "_trie", None) if self.use_trie else None
+        if trie is not None:
+            # tokens sharing the prefix that precedes the first wildcard
+            candidates = trie.get(query_term)
+        else:
+            candidates = list(self._index)
+
+        results = set()
+        for token in candidates:
+            # fullmatch, so that "*b" does not match "abc"
+            if pattern.fullmatch(token) is not None:
+                results.update(self._index.get(token, ()))
+        return results
+
+    def build(self, documents, verbose=False):
+        """Index `documents`, replacing any previously built index.
+
+        documents: sized sequence of strings; a document's position in
+            it is its id.
+        verbose: show a ``tqdm`` progress bar (requires tqdm).
+        """
+        self.documents = documents
+        # Start from scratch: ids restart at 0, so entries left over from
+        # an earlier build() would point at the wrong documents.
+        self._index = defaultdict(set)
+        if verbose:
+            from tqdm import tqdm
+            iteration = tqdm(enumerate(documents), total=len(documents))
+        else:
+            iteration = enumerate(documents)
+        for i, document in iteration:
+            tokens = self.preprocess(document)
+            for j, token in enumerate(tokens):
+                self._index[token].add(Item(i, j))
+
+        if self.use_trie:
+            from .trie import Trie
+            self._trie = Trie()
+            self._trie.add_tokens(self._index.keys())
+
+    def preprocess(self, doc):
+        """Normalize `doc` per the index settings and return its tokens."""
+        if self.ignore_case:
+            doc = doc.lower()
+        if self.ignore_accent:
+            doc = unidecode(doc)
+        doc = re.findall(WORD_REGEX, doc, re.UNICODE)
+        return doc
+
+    def search(self, query, return_ids=False):
+        """Evaluate the boolean `query` against the index.
+
+        Returns the set of matching document ids when `return_ids` is
+        true, otherwise the list of matching documents in id order.
+        """
+        query = parse_query(query,
+                            ignore_case=self.ignore_case,
+                            ignore_accent=self.ignore_accent)
+        ids = query.search(self)
+        if return_ids:
+            return ids
+
+        # sorted: set iteration order is arbitrary
+        return [self.documents[i] for i in sorted(ids)]
+
+    def count(self, query):
+        """Return the number of documents matching `query`."""
+        return len(self.search(query, return_ids=True))
+
+    def save(self, filename):
+        """Pickle the whole index, documents included, to `filename`."""
+        import pickle
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(filename):
+        """Return the index pickled in `filename`.
+
+        NOTE: unpickling can execute arbitrary code; only load files
+        that come from a trusted source.
+        """
+        import pickle
+        with open(filename, "rb") as f:
+            index = pickle.load(f)
+        return index
+
+
+def parse_query(query, ignore_case=True, ignore_accent=True):
+    """Parse a boolean query string into a tree searchable on an Index.
+
+    The query is split at the first ``AND NOT`` / ``AND`` / ``OR``
+    operator (case-insensitive, surrounded by spaces) whose left-hand side
+    is balanced according to ``is_balanced``; both sides are parsed
+    recursively. A query without operators, or a single quoted phrase,
+    becomes an ``IndexEntry``.
+
+    ignore_case: lowercase the terms.
+    ignore_accent: pass the terms through ``unidecode``.
+
+    Raises ValueError("Query malformed") for an empty query or when no
+    operator yields two balanced sides.
+    """
+    from .indexops import AND, ANDNOT, OR
+
+    if not query:
+        raise ValueError("Query malformed")
+
+    # remove brackets around query
+    if query[0] == '(' and query[-1] == ')':
+        query = strip_brackets(query)
+        if not query:
+            raise ValueError("Query malformed")
+
+    # A single quoted phrase has exactly two quotes: the enclosing ones.
+    # It must not be split, even if it contains words such as " AND ".
+    is_phrase = (query[0] == '"' and query[-1] == '"'
+                 and query.count('"') == 2)
+
+    # find all operators
+    if is_phrase:
+        operators = []
+    else:
+        operators = list(
+            re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE))
+
+    if not operators:
+        if ignore_case:
+            query = query.lower()
+        if ignore_accent:
+            query = unidecode(query)
+        return IndexEntry(query)
+
+    # stop at first balanced operation
+    for m in operators:
+        left_part = query[:m.start(0)]
+        if not is_balanced(left_part):
+            continue
+
+        right_part = query[m.end(0):]
+        if not is_balanced(right_part):
+            raise ValueError("Query malformed")
+        operator = m.group(1).lower()
+        break
+    else:
+        # no operator has a balanced left-hand side
+        raise ValueError("Query malformed")
+
+    node_types = {"or": OR, "and": AND, "and not": ANDNOT}
+    return node_types[operator](
+        parse_query(left_part, ignore_case, ignore_accent),
+        parse_query(right_part, ignore_case, ignore_accent)
+    )
diff --git a/eldar/indexops.py b/eldar/indexops.py
@@ -0,0 +1,36 @@
+
+
+class Binary:
+    """Base class of the query operators: holds the two operands."""
+
+    def __init__(self, left, right):
+        self.left, self.right = left, right
+
+
+class AND(Binary):
+    """Operator node matching what both operands match."""
+
+    def search(self, index):
+        # left operand is searched first, then the right one
+        return self.left.search(index).intersection(
+            self.right.search(index))
+
+    def __repr__(self):
+        return "({}) AND ({})".format(self.left, self.right)
+
+
+class ANDNOT(Binary):
+    """Operator node matching the left operand's matches minus the right's."""
+
+    def search(self, index):
+        # left operand is searched first, then the right one
+        return self.left.search(index).difference(
+            self.right.search(index))
+
+    def __repr__(self):
+        return "({}) AND NOT ({})".format(self.left, self.right)
+
+
+class OR(Binary):
+    """Operator node matching what either operand matches."""
+
+    def search(self, index):
+        # left operand is searched first, then the right one
+        return self.left.search(index).union(
+            self.right.search(index))
+
+    def __repr__(self):
+        return "({}) OR ({})".format(self.left, self.right)
diff --git a/eldar/trie.py b/eldar/trie.py
@@ -0,0 +1,45 @@
+
+
+class Trie:
+    """Character trie of tokens, stored as nested dicts.
+
+    A node maps each next character to its child node; the integer key
+    ``1`` in a node marks the end of a stored token.
+    """
+
+    def __init__(self):
+        self.trie = {}
+
+    def add_tokens(self, tokens):
+        """Store every token of the iterable `tokens`."""
+        for word in tokens:
+            self.add_token(word)
+
+    def add_token(self, token):
+        """Store `token`, creating the missing nodes along its path."""
+        node = self.trie
+        for char in token:
+            node = node.setdefault(char, {})
+        node.setdefault(1, 1)  # the word exist
+
+    def get(self, token):
+        """Return the stored tokens matching `token` as a list.
+
+        Without ``*``, the result is ``[token]`` if it is stored, else
+        ``[]``. With ``*``, the result is every stored token starting
+        with the characters before the first ``*``; whatever follows
+        that ``*`` is ignored.
+        """
+        node = self.trie
+        prefix = []
+        for char in token:
+            if char == "*":
+                return self.dfs("".join(prefix), node)
+            if char not in node:
+                return []
+            node = node[char]
+            prefix.append(char)
+        return [token] if 1 in node else []
+
+    def dfs(self, current_str, leaf):
+        """Return the tokens stored below `leaf`, prefixed by `current_str`."""
+        found = []
+        for key, child in leaf.items():
+            if key == 1:
+                found.append(current_str)
+                continue
+            found += self.dfs(current_str + key, child)
+        return found
diff --git a/setup.py b/setup.py
@@ -1,13 +1,14 @@
-from setuptools import setup, find_packages
 import pathlib
 
+from setuptools import find_packages, setup
+
 # The directory containing this file
 HERE = pathlib.Path(__file__).parent
 README = (HERE / "README.md").read_text()
 
 setup(
     name="eldar",
-    version="0.0.7",
+    version="0.0.8",
     author="Maixent Chenebaux",
     author_email="[email protected]",
     description="Boolean text search in Python",