From 8af016b7591b2558c305965c468226c0a75bf301 Mon Sep 17 00:00:00 2001 From: Maixent Chenebaux Date: Thu, 8 Sep 2022 11:38:13 +0200 Subject: [PATCH] added Index class --- README.md | 33 +++++++++ eldar/__init__.py | 3 +- eldar/entry.py | 67 +++++++++++++++++ eldar/index.py | 162 ++++++++++++++++++++++++++++++++++++++++++ eldar/indexops.py | 36 ++++++++++ eldar/trie.py | 45 ++++++++++++ setup.py | 5 +- tests/search_index.py | 20 ++++++ 8 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 eldar/index.py create mode 100644 eldar/indexops.py create mode 100644 eldar/trie.py create mode 100644 tests/search_index.py diff --git a/README.md b/README.md index 56c6624..58909fb 100755 --- a/README.md +++ b/README.md @@ -96,6 +96,39 @@ print(eldar(document)) # >>> True ``` +### Building an index for faster queries + +Searching in a large corpus using the Query object is slow, as each document has to be checked. +For (much) faster queries, create an `Index` object, and build it using a list of documents. + +```python +from eldar import Index +from eldar.trie import Trie + +documents = [ + "Gandalf is a fictional character in Tolkien's The Lord of the Rings", + "Frodo is the main character in The Lord of the Rings", + "Ian McKellen interpreted Gandalf in Peter Jackson's movies", + "Elijah Wood was cast as Frodo Baggins in Jackson's adaptation", + "The Lord of the Rings is an epic fantasy novel by J. R. R. 
class IndexEntry:
    """Leaf of a parsed boolean query: matches one term or quoted phrase
    against an inverted index (anything exposing ``get(token) -> items``,
    where each item has ``id`` and ``position`` attributes)."""

    def __init__(self, query_term):
        # set by the query parser when the term is negated; only __repr__
        # uses it here
        self.not_ = False

        if query_term == "*":
            raise ValueError(
                "Single character wildcards * are not implemented")

        # strip surrounding quotes for single words too, so '"frodo"' and
        # 'frodo' resolve to the same index key (previously only multiword
        # phrases were stripped, making quoted single terms never match)
        query_term = strip_quotes(query_term)
        if " " in query_term:  # multiword phrase query
            self.query_term = query_term.split()
            self.search = self.search_multiword
        else:
            self.query_term = query_term
            self.search = self.search_simple

    def search_simple(self, index):
        """Return the set of document ids containing the single term."""
        return {match.id for match in index.get(self.query_term)}

    def search_multiword(self, index):
        """Return ids of documents containing the words of the phrase at
        consecutive positions, in order."""
        docs = defaultdict(list)
        # iterate unique tokens: a repeated query word (e.g. "the the")
        # would otherwise append duplicate (position, token) pairs and
        # break the consecutive-position scan below
        for token in set(self.query_term):
            for item in index.get(token):
                docs[item.id].append((item.position, token))

        # loop invariants hoisted out of the per-document scan
        first_token = self.query_term[0]
        query_len = len(self.query_term)
        query_rest = self.query_term[1:]
        iter_rest = range(1, query_len)

        results = set()
        for doc_id, tokens in docs.items():
            tokens = sorted(tokens)
            if len(tokens) < query_len:
                continue
            for i in range(len(tokens) - query_len + 1):
                pos, tok = tokens[i]
                if tok != first_token:
                    continue
                # check that the remaining query words follow at
                # positions pos+1, pos+2, ...
                is_a_match = True
                for j, correct_token in zip(iter_rest, query_rest):
                    next_pos, next_tok = tokens[i + j]
                    if correct_token != next_tok or next_pos != pos + j:
                        is_a_match = False
                        break
                if is_a_match:
                    results.add(doc_id)
                    break
        return results

    def __repr__(self):
        if self.not_:
            return f'NOT "{self.query_term}"'
        return f'"{self.query_term}"'


def strip_quotes(query):
    """Remove one pair of surrounding double quotes, if present.

    The length guard keeps a lone '"' intact instead of returning ''.
    """
    if len(query) > 1 and query[0] == '"' and query[-1] == '"':
        return query[1:-1]
    return query


@dataclass(unsafe_hash=True, order=True)
class Item:
    # one posting: document id and token position within that document;
    # hashable so it can live in sets, ordered so postings sort by position
    id: int
    position: int
class Index:
    """Inverted index over a corpus for fast boolean queries.

    Maps each preprocessed token to a set of Item(id, position) postings.
    Queries are parsed into an operation tree (AND / AND NOT / OR) whose
    IndexEntry leaves look tokens up in this mapping.
    """

    def __init__(
        self,
        ignore_case=True,
        ignore_accent=True,
        use_trie=True
    ):
        self.ignore_case = ignore_case
        self.ignore_accent = ignore_accent
        self.use_trie = use_trie
        self._index = defaultdict(set)
        # set by build(); initialized here so a query against an unbuilt
        # index yields empty results instead of raising AttributeError
        self.documents = []
        self._trie = None

    def get(self, query_term):
        """Return the set of postings for `query_term`.

        A '*' in the term is a wildcard expanded against the vocabulary
        (via the prefix trie when available).
        """
        if query_term == "*":
            raise ValueError(
                "Single character wildcards * are not implemented")

        if "*" not in query_term:
            res = self._index.get(query_term, set())
            # indexes restored from disk may store lists; normalize
            if not isinstance(res, set):
                res = set(res)
            return res

        query_regex = query_term.replace("*", ".*")
        if self.use_trie and self._trie is not None:
            # the trie prunes the vocabulary to tokens sharing the
            # literal prefix before the first '*'
            candidates = self._trie.get(query_term)
        else:
            candidates = self._index
        matches = [
            token for token in candidates
            if re.match(query_regex, token) is not None
        ]
        results = set()
        for match in matches:
            res = self._index[match]
            if not isinstance(res, set):
                res = set(res)
            results.update(res)
        return results

    def build(self, documents, verbose=False):
        """Index `documents` (kept by reference, not copied).

        When `verbose` is true, progress is shown with tqdm.
        """
        self.documents = documents
        if verbose:
            from tqdm import tqdm
            iteration = tqdm(enumerate(documents), total=len(documents))
        else:
            iteration = enumerate(documents)
        for i, document in iteration:
            tokens = self.preprocess(document)
            for j, token in enumerate(tokens):
                self._index[token].add(Item(i, j))

        if self.use_trie:
            from .trie import Trie
            self._trie = Trie()
            self._trie.add_tokens(self._index.keys())

    def preprocess(self, doc):
        """Normalize (case/accents per index options) and tokenize."""
        if self.ignore_case:
            doc = doc.lower()
        if self.ignore_accent:
            doc = unidecode(doc)
        return re.findall(WORD_REGEX, doc, re.UNICODE)

    def search(self, query, return_ids=False):
        """Run a boolean `query` string; return the matching documents,
        or just their ids when `return_ids` is True."""
        query = parse_query(query,
                            ignore_case=self.ignore_case,
                            ignore_accent=self.ignore_accent)
        ids = query.search(self)
        if return_ids:
            return ids
        return [self.documents[i] for i in ids]

    def count(self, query):
        """Return the number of documents matching `query`."""
        return len(self.search(query, return_ids=True))

    def save(self, filename):
        """Pickle the whole index (documents included) to `filename`."""
        import pickle
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename):
        """Unpickle an index saved with save().

        NOTE(security): pickle.load can execute arbitrary code -- only
        load files you trust.
        """
        import pickle
        with open(filename, "rb") as f:
            return pickle.load(f)


def parse_query(query, ignore_case=True, ignore_accent=True):
    """Recursively parse a boolean query string into an operation tree of
    AND/ANDNOT/OR nodes with IndexEntry leaves."""
    from .indexops import AND, ANDNOT, OR

    # remove brackets around query
    if query[0] == '(' and query[-1] == ')':
        query = strip_brackets(query)
    # a fully quoted query is a single (possibly multiword) entry; it has
    # exactly two quotes (fixed: previously compared against 1, which
    # never matched a balanced phrase, so operator words *inside* quotes
    # wrongly split the phrase)
    if query[0] == '"' and query[-1] == '"' and query.count('"') == 2:
        if ignore_case:
            query = query.lower()
        if ignore_accent:
            query = unidecode(query)
        return IndexEntry(query)

    # find all operators at this nesting level
    match = []
    match_iter = re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE)
    for m in match_iter:
        start = m.start(0)
        end = m.end(0)
        operator = query[start+1:end-1].lower()
        match.append((operator, (start, end)))

    if match:
        # split at the first operator whose left side is bracket-balanced
        for operator, (start, end) in match:
            left_part = query[:start]
            if not is_balanced(left_part):
                continue
            right_part = query[end:]
            if not is_balanced(right_part):
                raise ValueError("Query malformed")
            break

        if operator == "or":
            return OR(
                parse_query(left_part, ignore_case, ignore_accent),
                parse_query(right_part, ignore_case, ignore_accent)
            )
        elif operator == "and":
            return AND(
                parse_query(left_part, ignore_case, ignore_accent),
                parse_query(right_part, ignore_case, ignore_accent)
            )
        elif operator == "and not":
            return ANDNOT(
                parse_query(left_part, ignore_case, ignore_accent),
                parse_query(right_part, ignore_case, ignore_accent)
            )

    # no operators: plain single term
    if ignore_case:
        query = query.lower()
    if ignore_accent:
        query = unidecode(query)
    return IndexEntry(query)
class Binary:
    """Base node for binary boolean operators in a parsed query tree.

    Holds the two child subtrees; subclasses combine the document-id sets
    their children produce.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def _operands(self, index):
        # evaluate both subtrees once; subclasses just combine the sets
        return self.left.search(index), self.right.search(index)


class AND(Binary):
    """Documents matching both operands."""

    def search(self, index):
        lhs, rhs = self._operands(index)
        return lhs & rhs

    def __repr__(self):
        return f"({self.left}) AND ({self.right})"


class ANDNOT(Binary):
    """Documents matching the left operand but not the right."""

    def search(self, index):
        lhs, rhs = self._operands(index)
        return lhs - rhs

    def __repr__(self):
        return f"({self.left}) AND NOT ({self.right})"


class OR(Binary):
    """Documents matching either operand."""

    def search(self, index):
        lhs, rhs = self._operands(index)
        return lhs | rhs

    def __repr__(self):
        return f"({self.left}) OR ({self.right})"
class Trie:
    """Character-level prefix tree over the index vocabulary.

    Supports exact lookup and wildcard expansion: a '*' in the queried
    token returns every stored word sharing the literal prefix before it.
    Nodes are plain dicts; the key ``1`` marks the end of a stored word.
    """

    def __init__(self):
        self.trie = {}

    def add_tokens(self, tokens):
        """Insert every token of an iterable."""
        for token in tokens:
            self.add_token(token)

    def add_token(self, token):
        """Insert a single token, creating intermediate nodes as needed."""
        node = self.trie
        for char in token:
            node = node.setdefault(char, {})
        node[1] = 1  # mark that a complete word ends here

    def get(self, token):
        """Return stored words matching `token`.

        Without '*': [token] if stored, else []. With '*': every stored
        word extending the prefix before the first '*'.
        """
        node = self.trie
        prefix = ""
        for char in token:
            if char == "*":
                return self.dfs(prefix, node)
            if char not in node:
                return []
            node = node[char]
            prefix += char
        return [token] if 1 in node else []

    def dfs(self, current_str, leaf):
        """Collect every complete word stored at or below `leaf`."""
        found = []
        for key, child in leaf.items():
            if key == 1:
                found.append(current_str)
            else:
                found.extend(self.dfs(current_str + key, child))
        return found
from eldar import Index

documents = [
    "Gandalf is a fictional character in Tolkien's The Lord of the Rings",
    "Frodo is the main character in The Lord of the Rings",
    "Ian McKellen interpreted Gandalf in Peter Jackson's movies",
    "Elijah Wood was cast as Frodo Baggins in Jackson's adaptation",
    "The Lord of the Rings is an epic fantasy novel by J. R. R. Tolkien",
    "Frodo Baggins is a hobbit",
]

index = Index()
index.build(documents)  # no copy is made: the index keeps a reference

# index.save("index.p")  # but documents are copied to disk
# index = Index.load("index.p")

# support for wildcard
print(index.search('"frodo b*" AND NOT hobbit'))