Skip to content

Commit

Permalink
added Index class
Browse files Browse the repository at this point in the history
  • Loading branch information
kerighan committed Sep 8, 2022
1 parent 9a410ee commit 8af016b
Show file tree
Hide file tree
Showing 8 changed files with 367 additions and 4 deletions.
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,39 @@ print(eldar(document))
# >>> True
```

### Building an index for faster queries

Searching in a large corpus using the Query object is slow, as each document has to be checked.
For (much) faster queries, create an `Index` object, and build it using a list of documents.

```python
from eldar import Index
from eldar.trie import Trie

documents = [
"Gandalf is a fictional character in Tolkien's The Lord of the Rings",
"Frodo is the main character in The Lord of the Rings",
"Ian McKellen interpreted Gandalf in Peter Jackson's movies",
"Elijah Wood was cast as Frodo Baggins in Jackson's adaptation",
"The Lord of the Rings is an epic fantasy novel by J. R. R. Tolkien",
"Frodo Baggins is a hobbit"
]

index = Index(ignore_case=True, ignore_accent=True)
index.build(documents) # must only be done once

# persist and retrieve index from disk
index.save("index.p") # note: the documents themselves are saved along with the index
index = Index.load("index.p")

print(index.search('"frodo b*" AND NOT hobbit')) # support wildcards
print(index.count('"frodo b*" AND NOT hobbit')) # shows only the count
# to only return document ids, set `return_ids` to True:
print(index.search('"frodo b*" AND NOT hobbit', return_ids=True))
```

It works the way a typical search engine does: it keeps a dictionary that maps each word to the ids of the documents containing it, together with the word's position in each document. The boolean query is parsed into a tree of operations, and the resulting sets of document ids are united, intersected or subtracted to produce the desired matches.

## License

This package is MIT licensed.
3 changes: 1 addition & 2 deletions eldar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
__version__ = "0.0.6"

from .query import Query
from .index import Index
67 changes: 67 additions & 0 deletions eldar/entry.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re
from collections import defaultdict
from dataclasses import dataclass

from .regex import WILD_CARD_REGEX


Expand Down Expand Up @@ -44,7 +47,71 @@ def __repr__(self):
return f'"{self.query}"'


class IndexEntry:
    """Leaf of a parsed query: a single term or a phrase of several terms.

    ``self.search`` is bound at construction time to ``search_simple`` for a
    one-word term and to ``search_multiword`` for a phrase (a term that
    contains a space). Both take an index object exposing ``get(token)``,
    which must return an iterable of items with ``id`` and ``position``
    attributes, and both return a set of document ids.
    """

    def __init__(self, query_term):
        """Store the term and pick the matching search strategy.

        Raises:
            ValueError: if the term is a lone ``*`` wildcard.
        """
        self.not_ = False

        # Strip surrounding quotes for every term, not only for phrases,
        # otherwise a quoted single word such as '"gandalf"' is looked up
        # with its quotes and can never match an indexed token.
        term = strip_quotes(query_term) if query_term else query_term

        if term == "*":
            raise ValueError(
                "Single character wildcards * are not implemented")

        if " " in term:  # multiword query
            self.query_term = term.split()
            self.search = self.search_multiword
        else:
            self.query_term = term
            self.search = self.search_simple

    def search_simple(self, index):
        """Return the ids of the documents that contain the single term."""
        return {match.id for match in index.get(self.query_term)}

    def search_multiword(self, index):
        """Return the ids of the documents that contain the whole phrase.

        A document matches when the phrase tokens occur at consecutive
        positions, in order.
        """
        tokens = self.query_term
        if not tokens:
            return set()

        # token -> {doc id -> positions of that token in the document}.
        # Each distinct token is looked up once, so a word repeated in the
        # phrase ("the lord of the rings") cannot create duplicate entries.
        positions = {}
        for token in set(tokens):
            per_doc = defaultdict(set)
            for item in index.get(token):
                per_doc[item.id].add(item.position)
            positions[token] = per_doc

        first_token = tokens[0]
        results = set()
        for doc_id, starts in positions[first_token].items():
            for start in starts:
                # Check positions directly instead of relying on neighbours
                # in a sorted list: two query tokens (e.g. a word and a
                # wildcard) may legitimately hit the same position.
                if all(
                    start + offset in positions[token].get(doc_id, ())
                    for offset, token in enumerate(tokens[1:], 1)
                ):
                    results.add(doc_id)
                    break
        return results

    def __repr__(self):
        if self.not_:
            return f'NOT "{self.query_term}"'
        return f'"{self.query_term}"'


def strip_quotes(query):
    """Return *query* without its surrounding double quotes.

    The quotes are removed only when the string both starts and ends with a
    double quote; any other string, including the empty string and a lone
    quote character, is returned unchanged.
    """
    # The length check keeps "" from raising IndexError and stops a single
    # '"' from being treated as an opening and a closing quote at once.
    if len(query) >= 2 and query[0] == '"' and query[-1] == '"':
        return query[1:-1]
    return query


@dataclass(order=True, unsafe_hash=True)
class Item:
    """One occurrence of a token inside a document.

    Instances are hashable, so they can be stored in sets, and they are
    ordered by ``id`` first and ``position`` second.
    """

    id: int  # identifier of the document holding the token
    position: int  # rank of the token inside that document
162 changes: 162 additions & 0 deletions eldar/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import re
from collections import defaultdict

from unidecode import unidecode

from .entry import IndexEntry, Item
from .query import is_balanced, strip_brackets
from .regex import WORD_REGEX


class Index:
    """Inverted index over a collection of documents.

    ``build`` maps every token of every document to the set of its
    occurrences (``Item(document id, token position)``); ``search`` and
    ``count`` then answer boolean queries from that mapping instead of
    scanning each document.

    Args:
        ignore_case: lower-case documents and queries before matching.
        ignore_accent: run documents and queries through ``unidecode``.
        use_trie: also keep a trie of the tokens, used to narrow down the
            candidates of wildcard (``*``) terms.
    """

    def __init__(
        self,
        ignore_case=True,
        ignore_accent=True,
        use_trie=True
    ):
        self.ignore_case = ignore_case
        self.ignore_accent = ignore_accent
        self.use_trie = use_trie
        # Defined up front so that querying an index that was never built
        # returns nothing instead of raising AttributeError.
        self.documents = []
        self._index = defaultdict(set)
        self._trie = None

    def get(self, query_term):
        """Return the set of occurrences recorded for *query_term*.

        ``*`` in the term matches any run of characters (possibly empty);
        every other character is matched literally and the whole token has
        to match.

        Raises:
            ValueError: if the term is a lone ``*``.
        """
        if query_term == "*":
            raise ValueError(
                "Single character wildcards * are not implemented")

        if "*" not in query_term:
            res = self._index.get(query_term, set())
            return res if isinstance(res, set) else set(res)

        # Escape the literal parts so characters such as "." or "+" are not
        # read as regex syntax, and use fullmatch so that "fro*o" does not
        # match "frodos" through a mere prefix match.
        pattern = re.compile(
            ".*".join(re.escape(part) for part in query_term.split("*")))

        # getattr: indexes pickled before `_trie` was always initialised may
        # lack the attribute. Without a trie, scan every indexed token.
        trie = getattr(self, "_trie", None) if self.use_trie else None
        candidates = self._index if trie is None else trie.get(query_term)

        results = set()
        for token in candidates:
            if pattern.fullmatch(token) is not None:
                # .get() avoids inserting empty entries in the defaultdict
                results.update(self._index.get(token, ()))
        return results

    def build(self, documents, verbose=False):
        """Index *documents*, replacing any previously built index.

        Args:
            documents: sequence of strings; it is kept as ``self.documents``
                and document ids are positions in that sequence.
            verbose: show a ``tqdm`` progress bar while indexing.
        """
        self.documents = documents
        # Start from scratch: ids restart at 0, so postings left over from a
        # previous build would point into the wrong document list.
        self._index = defaultdict(set)
        self._trie = None

        if verbose:
            from tqdm import tqdm
            iteration = tqdm(enumerate(documents), total=len(documents))
        else:
            iteration = enumerate(documents)
        for i, document in iteration:
            for j, token in enumerate(self.preprocess(document)):
                self._index[token].add(Item(i, j))

        if self.use_trie:
            from .trie import Trie
            self._trie = Trie()
            self._trie.add_tokens(self._index.keys())

    def preprocess(self, doc):
        """Normalise *doc* per the index options and return its tokens.

        Tokens are whatever ``WORD_REGEX`` finds in the normalised text.
        """
        if self.ignore_case:
            doc = doc.lower()
        if self.ignore_accent:
            doc = unidecode(doc)
        return re.findall(WORD_REGEX, doc, re.UNICODE)

    def search(self, query, return_ids=False):
        """Run the boolean *query* against the index.

        Returns:
            The set of matching document ids when *return_ids* is true,
            otherwise the list of matching documents in ascending id order.
        """
        query = parse_query(query,
                            ignore_case=self.ignore_case,
                            ignore_accent=self.ignore_accent)
        ids = query.search(self)
        if return_ids:
            return ids

        # sorted(): a set has no defined order; return documents in a
        # deterministic (corpus) order.
        return [self.documents[i] for i in sorted(ids)]

    def count(self, query):
        """Return the number of documents matching *query*."""
        return len(self.search(query, return_ids=True))

    def save(self, filename):
        """Pickle the whole index, documents included, to *filename*."""
        import pickle
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename):
        """Load an index previously written by ``save``.

        SECURITY: unpickling can execute arbitrary code. Only load files
        that come from a trusted source.
        """
        import pickle
        with open(filename, "rb") as f:
            index = pickle.load(f)
        return index


def parse_query(query, ignore_case=True, ignore_accent=True):
    """Turn a boolean query string into a tree of search operations.

    The query is split at the first ``AND``, ``OR`` or ``AND NOT`` operator
    (case-insensitive, surrounded by single spaces) whose left-hand side
    passes ``is_balanced``; both sides are then parsed recursively. There is
    no operator precedence: use brackets to group sub-queries. A query
    without any operator, or one made of a single quoted phrase, becomes an
    ``IndexEntry``.

    Args:
        query: the query string.
        ignore_case: lower-case the terms before building entries.
        ignore_accent: run the terms through ``unidecode``.

    Returns:
        An ``IndexEntry``, ``AND``, ``OR`` or ``ANDNOT`` node exposing
        ``search(index)``.

    Raises:
        ValueError: if the query is empty or cannot be split into balanced
            parts.
    """
    from .indexops import AND, ANDNOT, OR

    def make_entry(term):
        # normalise the term the same way documents are normalised
        if ignore_case:
            term = term.lower()
        if ignore_accent:
            term = unidecode(term)
        return IndexEntry(term)

    if not query:
        raise ValueError("Query malformed")

    # remove brackets around query
    if query[0] == '(' and query[-1] == ')':
        query = strip_brackets(query)
        if not query:
            raise ValueError("Query malformed")

    # A single quoted phrase has exactly two quotes: the opening and the
    # closing one. (Testing for one quote could never succeed here, which
    # let operators inside a quoted phrase split the phrase.)
    if query[0] == '"' and query[-1] == '"' and query.count('"') == 2:
        return make_entry(query)

    operators = {"or": OR, "and": AND, "and not": ANDNOT}

    found_operator = False
    for m in re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE):
        found_operator = True
        # stop at first balanced operation
        left_part = query[:m.start(0)]
        if not is_balanced(left_part):
            continue

        right_part = query[m.end(0):]
        if not is_balanced(right_part):
            raise ValueError("Query malformed")

        node = operators[m.group(1).lower()]
        return node(
            parse_query(left_part, ignore_case, ignore_accent),
            parse_query(right_part, ignore_case, ignore_accent)
        )

    if found_operator:
        # Operators are present but none has a balanced left-hand side:
        # report it instead of failing on an unassigned right-hand side.
        raise ValueError("Query malformed")

    return make_entry(query)
36 changes: 36 additions & 0 deletions eldar/indexops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@


class Binary:
    """Base class of the query-tree nodes that combine two operands."""

    def __init__(self, left, right):
        # keep both sub-queries; subclasses decide how to combine them
        self.left, self.right = left, right


class AND(Binary):
    """Query node keeping the ids matched by both operands."""

    def search(self, index):
        """Search both operands and return the intersection of their ids."""
        lhs_ids = self.left.search(index)
        rhs_ids = self.right.search(index)
        return lhs_ids.intersection(rhs_ids)

    def __repr__(self):
        return "({}) AND ({})".format(self.left, self.right)


class ANDNOT(Binary):
    """Query node keeping the left operand's ids minus the right operand's."""

    def search(self, index):
        """Search both operands and return the left ids not found on the right."""
        kept_ids = self.left.search(index)
        excluded_ids = self.right.search(index)
        return kept_ids.difference(excluded_ids)

    def __repr__(self):
        return "({}) AND NOT ({})".format(self.left, self.right)


class OR(Binary):
    """Query node keeping the ids matched by either operand."""

    def search(self, index):
        """Search both operands and return the union of their ids."""
        lhs_ids = self.left.search(index)
        rhs_ids = self.right.search(index)
        return lhs_ids.union(rhs_ids)

    def __repr__(self):
        return "({}) OR ({})".format(self.left, self.right)
45 changes: 45 additions & 0 deletions eldar/trie.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@


class Trie:
    """Prefix tree of tokens, used to resolve wildcard lookups.

    Every node is a plain dict mapping a character to its child node; the
    integer key ``1`` marks a node at which a stored token ends.
    """

    def __init__(self):
        self.trie = {}

    def add_tokens(self, tokens):
        """Insert every token of the iterable *tokens*."""
        for token in tokens:
            self.add_token(token)

    def add_token(self, token):
        """Insert one token; inserting a stored token again changes nothing."""
        leaf = self.trie
        for char in token:
            leaf = leaf.setdefault(char, {})
        leaf[1] = 1  # the word exists

    def get(self, token):
        """Return the stored tokens that *token* may designate.

        Without ``*``, the result is ``[token]`` if it is stored and ``[]``
        otherwise. With ``*``, every stored token starting with the part of
        *token* that precedes the first ``*`` is returned; whatever follows
        that ``*`` is ignored here and must be checked by the caller.
        """
        leaf = self.trie
        prefix = []
        for char in token:
            if char == "*":
                return self.dfs("".join(prefix), leaf)
            if char not in leaf:
                return []
            leaf = leaf[char]
            prefix.append(char)
        return [token] if 1 in leaf else []

    def dfs(self, current_str, leaf):
        """List the tokens stored under *leaf*, each prefixed by *current_str*.

        The traversal is depth-first, in insertion order. It uses an
        explicit stack rather than recursion so that very long tokens
        cannot exceed the interpreter's recursion limit.
        """
        res = []
        stack = [(current_str, iter(leaf.items()))]
        while stack:
            prefix, children = stack[-1]
            for key, child in children:
                if key == 1:
                    res.append(prefix)
                else:
                    # descend now; the parent's iterator resumes afterwards
                    stack.append((prefix + key, iter(child.items())))
                    break
            else:
                stack.pop()  # every child of this node has been visited
        return res
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from setuptools import setup, find_packages
import pathlib

from setuptools import find_packages, setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent
README = (HERE / "README.md").read_text()

setup(
name="eldar",
version="0.0.7",
version="0.0.8",
author="Maixent Chenebaux",
author_email="[email protected]",
description="Boolean text search in Python",
Expand Down
Loading

0 comments on commit 8af016b

Please sign in to comment.