-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
367 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,2 @@ | ||
__version__ = "0.0.6" | ||
|
||
from .query import Query | ||
from .index import Index |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
import re | ||
from collections import defaultdict | ||
|
||
from unidecode import unidecode | ||
|
||
from .entry import IndexEntry, Item | ||
from .query import is_balanced, strip_brackets | ||
from .regex import WORD_REGEX | ||
|
||
|
||
class Index:
    """Inverted index mapping preprocessed tokens to their occurrences.

    Tokens are produced by :meth:`preprocess`. Every occurrence is stored as
    ``Item(document_position, token_position)``. Query terms may contain
    ``*`` wildcards, which :meth:`get` expands against the indexed tokens.
    """

    def __init__(
        self,
        ignore_case=True,
        ignore_accent=True,
        use_trie=True
    ):
        """Create an empty index.

        Args:
            ignore_case: lowercase documents (and queries) before indexing.
            ignore_accent: pass documents (and queries) through ``unidecode``.
            use_trie: build a trie of the tokens in :meth:`build` and use it
                to narrow wildcard lookups.
        """
        self.ignore_case = ignore_case
        self.ignore_accent = ignore_accent
        self.use_trie = use_trie
        self._index = defaultdict(set)
        # Defined up front so the index is usable (and empty) before build().
        self._trie = None
        self.documents = []

    def get(self, query_term):
        """Return the set of entries stored for ``query_term``.

        Without a wildcard the term must equal an indexed token. With one or
        more ``*`` the term is treated as a glob: each ``*`` stands for any
        run of characters and the *whole* token has to match.

        For a wildcard-free term the stored set itself is returned, not a
        copy, so callers should not mutate it.

        Raises:
            ValueError: if the term is the bare wildcard ``"*"``.
        """
        if query_term == "*":
            raise ValueError(
                "Single character wildcards * are not implemented")

        if "*" not in query_term:
            return self._postings(query_term)

        # Escape the literal pieces so characters such as "." or "(" in the
        # term are not interpreted as regex syntax; only "*" is a wildcard.
        pattern = re.compile(
            ".*".join(re.escape(part) for part in query_term.split("*")))

        # getattr: tolerate instances that never had a trie attached.
        trie = getattr(self, "_trie", None) if self.use_trie else None
        if trie is not None:
            # The trie narrows candidates to tokens sharing the literal
            # prefix in front of the first "*".
            candidates = trie.get(query_term)
        else:
            candidates = self._index

        results = set()
        for token in candidates:
            # fullmatch, not match: "f*o" must not accept "foobar".
            if pattern.fullmatch(token) is not None:
                results.update(self._postings(token))
        return results

    def _postings(self, token):
        """Return the entries stored for ``token`` as a set (empty if none)."""
        # .get() avoids inserting an empty entry into the defaultdict.
        res = self._index.get(token, set())
        if not isinstance(res, set):
            res = set(res)
        return res

    def build(self, documents, verbose=False):
        """Index ``documents``, replacing any previously built content.

        Args:
            documents: sequence of strings; must support ``len()`` when
                ``verbose`` is true.
            verbose: show a ``tqdm`` progress bar while indexing.
        """
        self.documents = documents
        # Start from scratch: positions from an earlier build would point at
        # the wrong entries of the new ``documents``.
        self._index = defaultdict(set)
        if verbose:
            from tqdm import tqdm
            iteration = tqdm(enumerate(documents), total=len(documents))
        else:
            iteration = enumerate(documents)
        for i, document in iteration:
            tokens = self.preprocess(document)
            for j, token in enumerate(tokens):
                self._index[token].add(Item(i, j))

        if self.use_trie:
            from .trie import Trie
            self._trie = Trie()
            self._trie.add_tokens(self._index.keys())
        else:
            self._trie = None

    def preprocess(self, doc):
        """Normalize ``doc`` per the index settings and split it into tokens.

        Returns the list of ``WORD_REGEX`` matches found in the normalized
        text.
        """
        if self.ignore_case:
            doc = doc.lower()
        if self.ignore_accent:
            doc = unidecode(doc)
        doc = re.findall(WORD_REGEX, doc, re.UNICODE)
        return doc

    def search(self, query, return_ids=False):
        """Parse ``query`` and evaluate it against this index.

        Args:
            query: boolean query string understood by ``parse_query``.
            return_ids: when true, return the raw result of the parsed
                query's ``search`` instead of the documents.

        Returns:
            The ids when ``return_ids`` is true, otherwise a list with
            ``self.documents[i]`` for every id ``i``.
        """
        query = parse_query(query,
                            ignore_case=self.ignore_case,
                            ignore_accent=self.ignore_accent)
        ids = query.search(self)
        if return_ids:
            return ids

        return [self.documents[i] for i in ids]

    def count(self, query):
        """Return the number of ids matched by ``query``."""
        return len(self.search(query, return_ids=True))

    def save(self, filename):
        """Pickle the whole index (including ``documents``) to ``filename``."""
        import pickle
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename):
        """Load an index previously written by :meth:`save`.

        SECURITY: unpickling can execute arbitrary code. Only load files
        that come from a trusted source.
        """
        import pickle
        with open(filename, "rb") as f:
            index = pickle.load(f)
        return index
|
||
|
||
def parse_query(query, ignore_case=True, ignore_accent=True):
    """Parse a boolean query string into a tree of index operations.

    The query is split at the first `` AND NOT ``, `` AND `` or `` OR ``
    operator (case-insensitive, surrounded by single spaces) whose left and
    right parts are both accepted by ``is_balanced``; each side is parsed
    recursively. A query without operators, or a single quoted phrase,
    becomes an ``IndexEntry`` leaf.

    Args:
        query: the query string.
        ignore_case: lowercase leaf terms before building the entry.
        ignore_accent: pass leaf terms through ``unidecode``.

    Returns:
        An ``AND`` / ``ANDNOT`` / ``OR`` node or an ``IndexEntry``.

    Raises:
        ValueError: if the query is empty or cannot be split into balanced
            operands.
    """
    from .indexops import AND, ANDNOT, OR

    # remove brackets around query
    if query and query[0] == '(' and query[-1] == ')':
        query = strip_brackets(query)
    if not query:
        # Indexing an empty string below would raise an opaque IndexError.
        raise ValueError("Query malformed")

    # If exactly one pair of quotes wraps the whole query it is a single
    # phrase: make an entry without looking for operators inside it.
    if query[0] == '"' and query[-1] == '"' and query.count('"') == 2:
        return _make_entry(query, ignore_case, ignore_accent)

    operators = {"or": OR, "and": AND, "and not": ANDNOT}
    found_operator = False
    for m in re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE):
        found_operator = True
        left_part = query[:m.start(0)]
        if not is_balanced(left_part):
            # This operator is not a valid split point; try the next one.
            continue

        # stop at first balanced operation
        right_part = query[m.end(0):]
        if not is_balanced(right_part):
            raise ValueError("Query malformed")

        node = operators[m.group(1).lower()]
        return node(
            parse_query(left_part, ignore_case, ignore_accent),
            parse_query(right_part, ignore_case, ignore_accent)
        )

    if found_operator:
        # Operators were present but none of them splits the query into
        # balanced halves.
        raise ValueError("Query malformed")

    return _make_entry(query, ignore_case, ignore_accent)


def _make_entry(query, ignore_case, ignore_accent):
    """Normalize a leaf term like the indexed text and wrap it in IndexEntry."""
    if ignore_case:
        query = query.lower()
    if ignore_accent:
        query = unidecode(query)
    return IndexEntry(query)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
|
||
|
||
class Binary:
    """Base class for query operations that combine two operands."""

    def __init__(self, left, right):
        """Store the ``left`` and ``right`` operands of the operation."""
        self.left, self.right = left, right
|
||
|
||
class AND(Binary):
    """Binary operation keeping only results found by both operands."""

    def search(self, index):
        """Search both operands in ``index`` and intersect their results."""
        hits = self.left.search(index)
        other_hits = self.right.search(index)
        return hits.intersection(other_hits)

    def __repr__(self):
        """Render the operation with each operand in parentheses."""
        left, right = self.left, self.right
        return f"({left}) AND ({right})"
|
||
|
||
class ANDNOT(Binary):
    """Binary operation keeping left results absent from the right ones."""

    def search(self, index):
        """Search both operands in ``index``; return left minus right."""
        kept = self.left.search(index)
        excluded = self.right.search(index)
        return kept.difference(excluded)

    def __repr__(self):
        """Render the operation with each operand in parentheses."""
        left, right = self.left, self.right
        return f"({left}) AND NOT ({right})"
|
||
|
||
class OR(Binary):
    """Binary operation keeping results found by either operand."""

    def search(self, index):
        """Search both operands in ``index`` and merge their results."""
        hits = self.left.search(index)
        other_hits = self.right.search(index)
        return hits.union(other_hits)

    def __repr__(self):
        """Render the operation with each operand in parentheses."""
        left, right = self.left, self.right
        return f"({left}) OR ({right})"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
|
||
|
||
class Trie:
    """Character trie of tokens stored as nested dictionaries.

    Each node maps a character to its child node. The integer key ``1``
    (which can never collide with a one-character string key) marks that the
    path from the root to this node spells a complete token.
    """

    # Sentinel key marking the end of a stored token.
    _END = 1

    def __init__(self):
        self.trie = {}

    def add_tokens(self, tokens):
        """Insert every token of the iterable ``tokens``."""
        for token in tokens:
            self.add_token(token)

    def add_token(self, token):
        """Insert a single token; inserting it again is a no-op."""
        leaf = self.trie
        for char in token:
            # setdefault creates the child node on first use.
            leaf = leaf.setdefault(char, {})
        leaf[self._END] = 1  # the word exists

    def get(self, token):
        """Return the stored tokens selected by ``token``.

        Without ``*`` the result is ``[token]`` if the token is stored and
        ``[]`` otherwise. With ``*`` only the literal prefix in front of the
        first ``*`` is used: every stored token starting with that prefix is
        returned, and anything after the first ``*`` is left to the caller
        to filter.
        """
        prefix, star, _ = token.partition("*")
        leaf = self.trie
        for char in prefix:
            leaf = leaf.get(char)
            if leaf is None:
                return []
        if star:
            return self.dfs(prefix, leaf)
        if self._END in leaf:
            return [token]
        return []

    def dfs(self, current_str, leaf):
        """Return all tokens below ``leaf``, each prefixed by ``current_str``.

        The traversal is depth-first in dictionary order. It uses an explicit
        stack instead of recursion so that very long tokens cannot exceed
        the interpreter's recursion limit.
        """
        res = []
        # Each frame is (prefix spelled so far, iterator over the node's
        # items); keeping the iterator lets us resume a node after visiting
        # one of its children.
        stack = [(current_str, iter(leaf.items()))]
        while stack:
            prefix, children = stack[-1]
            for key, child in children:
                if key == self._END:
                    res.append(prefix)
                else:
                    stack.append((prefix + key, iter(child.items())))
                    break
            else:
                # Node exhausted: go back to its parent.
                stack.pop()
        return res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,14 @@ | ||
from setuptools import setup, find_packages | ||
import pathlib | ||
|
||
from setuptools import find_packages, setup | ||
|
||
# The directory containing this file | ||
HERE = pathlib.Path(__file__).parent | ||
README = (HERE / "README.md").read_text() | ||
|
||
setup( | ||
name="eldar", | ||
version="0.0.7", | ||
version="0.0.8", | ||
author="Maixent Chenebaux", | ||
author_email="[email protected]", | ||
description="Boolean text search in Python", | ||
|
Oops, something went wrong.