Skip to content

Commit

Permalink
added MIT License
Browse files Browse the repository at this point in the history
  • Loading branch information
kerighan committed Mar 8, 2022
2 parents f506423 + 436277f commit 9a410ee
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 210 deletions.
21 changes: 17 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,23 @@ Let the query be ```query = '"movie"'```:
* If `ignore_accent` is True, the document "mövie" will be matched.
* If `match_word` is True, the document will be tokenized and the query terms will have to match exactly. If set to False, the documents "movies" and "movie" will be matched. Setting this option to True may slow down the query.

## License
### Wildcards

This package is MIT licensed.
Queries also support `*` as a wildcard character. A wildcard matches any number (including none) of alphanumeric characters.

## Authors
```python
from eldar import Query

Maixent Chenebaux

# sample document and query with multiple wildcards:
document = "Gandalf is a fictional character in Tolkien's The Lord of the Rings"
eldar = Query('"g*dal*"')

# call to see if the text matches the query:
print(eldar(document))
# >>> True
```

## License

This package is MIT licensed.
206 changes: 2 additions & 204 deletions eldar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,205 +1,3 @@
from unidecode import unidecode
import re
__version__ = "0.0.6"


__version__ = "0.0.5"


# Tokenizer pattern: either a run of word characters or a single punctuation
# character from the bracketed class.
# NOTE(review): inside the character class, `+-=` is parsed as a range from
# "+" to "=" rather than three literal characters — confirm this is intended.
word_regex = r'([\w]+|[,?;.:\/!()\[\]\'"’\\><+-=])'


class Query:
    """Boolean text query that can be evaluated against documents.

    The query string is parsed once at construction time; the resulting
    tree is stored in ``self.query`` and reused for every document.

    Args:
        query: The query string handed to ``parse_query``.
        ignore_case: Lower-case documents before matching.
        ignore_accent: Run documents through ``unidecode`` before matching.
        match_word: Tokenize documents with ``word_regex`` and match
            against the resulting set of tokens instead of the raw text.
    """

    def __init__(
        self,
        query,
        ignore_case=True,
        ignore_accent=True,
        match_word=True
    ):
        self.ignore_case = ignore_case
        self.ignore_accent = ignore_accent
        self.match_word = match_word
        self.query = parse_query(query, ignore_case, ignore_accent)

    def preprocess(self, doc):
        """Normalize *doc* according to the flags given at construction.

        Returns a string, or a set of tokens when ``match_word`` is set.
        """
        text = doc.lower() if self.ignore_case else doc
        if self.ignore_accent:
            text = unidecode(text)
        if not self.match_word:
            return text
        return set(re.findall(word_regex, text, re.UNICODE))

    def evaluate(self, doc):
        """Return the result of evaluating the parsed query on *doc*."""
        return self.query.evaluate(self.preprocess(doc))

    def filter(self, documents):
        """Return a list of the documents that satisfy the query, in order."""
        return [doc for doc in documents if self.evaluate(doc)]

    def __call__(self, doc):
        """Shorthand for :meth:`evaluate`."""
        return self.evaluate(doc)

    def __repr__(self):
        return repr(self.query)


class Binary:
    """Base node for a boolean operator with a left and a right operand."""

    def __init__(self, left, right):
        # Operands are stored exactly as given.
        self.left, self.right = left, right


class AND(Binary):
    """Conjunction node: true only when both operands match the document."""

    def evaluate(self, doc):
        # ``all`` stops at the first operand that fails, so the right side
        # is only evaluated when the left side matched.
        return all(side.evaluate(doc) for side in (self.left, self.right))

    def __repr__(self):
        return "({}) AND ({})".format(self.left, self.right)


class ANDNOT(Binary):
    """Difference node: the left operand matches and the right one does not."""

    def evaluate(self, doc):
        # The right operand is only evaluated once the left one has matched.
        if self.left.evaluate(doc):
            return not self.right.evaluate(doc)
        return False

    def __repr__(self):
        return "({}) AND NOT ({})".format(self.left, self.right)


# class NOT(Binary):
# def evaluate(self, doc):



class OR(Binary):
    """Disjunction node: true when at least one operand matches the document."""

    def evaluate(self, doc):
        # ``any`` stops at the first operand that matches, so the right side
        # is only evaluated when the left side did not match.
        return any(side.evaluate(doc) for side in (self.left, self.right))

    def __repr__(self):
        return "({}) OR ({})".format(self.left, self.right)


class Entry:
    """Leaf node of a query: a single term, optionally negated.

    Args:
        query: The term text. A leading ``"not "`` (in any letter case)
            marks the entry as negated; surrounding double quotes are
            removed by ``strip_quotes``.
    """

    def __init__(self, query):
        self.not_ = False
        # Compare case-insensitively: the query is only lower-cased upstream
        # when ignore_case is enabled, so "NOT " must be recognised as well.
        if query[:4].lower() == "not ":
            self.not_ = True
            query = query[4:]
        self.query = strip_quotes(query)

    def evaluate(self, doc):
        """Return whether the term is contained in *doc*.

        *doc* may be any container supporting ``in`` (a string gives a
        substring test, a set of tokens gives a membership test). The
        result is inverted for negated entries.
        """
        found = self.query in doc
        return not found if self.not_ else found

    def __repr__(self):
        if self.not_:
            return f'NOT "{self.query}"'
        return f'"{self.query}"'


def parse_query(query, ignore_case=True, ignore_accent=True):
    """Parse a boolean query string into a tree of operator and entry nodes.

    The query is split at the first `` AND NOT ``, `` AND `` or `` OR ``
    operator (matched case-insensitively) whose left-hand side has balanced
    brackets and quotes; both sides are then parsed recursively. A query
    without such an operator becomes a single ``Entry``.

    Args:
        query: The query string.
        ignore_case: Lower-case entry text before building the ``Entry``.
        ignore_accent: Run entry text through ``unidecode``.

    Returns:
        An ``OR``, ``AND``, ``ANDNOT`` or ``Entry`` node.

    Raises:
        ValueError: If the query is empty, or if no operator splits it into
            two balanced halves.
    """
    # remove brackets around query
    if query[:1] == '(' and query[-1:] == ')':
        query = strip_brackets(query)
    if not query:
        raise ValueError("Query malformed")

    def make_entry(text):
        # Normalise the term the same way documents are normalised.
        if ignore_case:
            text = text.lower()
        if ignore_accent:
            text = unidecode(text)
        return Entry(text)

    # A single quoted phrase has exactly two quotes (opening and closing).
    # It is a leaf even when the phrase itself contains " AND " or " OR ".
    if query[0] == '"' and query[-1] == '"' and query.count('"') == 2:
        return make_entry(query)

    node_types = {"or": OR, "and": AND, "and not": ANDNOT}
    found_operator = False
    for m in re.finditer(r" (AND NOT|AND|OR) ", query, re.IGNORECASE):
        found_operator = True
        start, end = m.span()

        # stop at first balanced operation
        left_part = query[:start]
        if not is_balanced(left_part):
            # This operator sits inside brackets or quotes; try the next one.
            continue

        right_part = query[end:]
        if not is_balanced(right_part):
            raise ValueError("Query malformed")

        node_type = node_types[m.group(1).lower()]
        return node_type(
            parse_query(left_part, ignore_case, ignore_accent),
            parse_query(right_part, ignore_case, ignore_accent)
        )

    if found_operator:
        # Every operator was nested inside unbalanced text, so there is no
        # valid place to split the query.
        raise ValueError("Query malformed")

    return make_entry(query)


def strip_brackets(query):
    """Remove one pair of outer brackets when they enclose the whole query.

    The query is returned unchanged when the bracket depth drops back to
    zero before the last character (the first and last brackets then do not
    belong together, as in ``(a) OR (b)``), or when it is not wrapped in
    brackets at all.
    """
    depth = 0
    # The last character is deliberately excluded from the scan.
    for position, char in enumerate(query[:-1]):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        if position > 0 and depth == 0:
            return query

    wrapped = query[0] == "(" and query[-1] == ")"
    return query[1:-1] if wrapped else query


def strip_quotes(query):
    """Return *query* without its surrounding double quotes, if it has them.

    Strings that do not both start and end with ``"`` are returned
    unchanged, including the empty string.
    """
    # Slices rather than indexes, so an empty string does not raise
    # IndexError.
    if query[:1] == '"' and query[-1:] == '"':
        return query[1:-1]
    return query


def is_balanced(query):
    """Tell whether brackets and double quotes in *query* pair up by count.

    Only the number of occurrences is compared: as many "(" as ")" and an
    even number of double quotes. Nesting order is not checked.
    """
    if query.count('"') % 2:
        return False
    return query.count("(") == query.count(")")
from .query import Query
50 changes: 50 additions & 0 deletions eldar/entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import re
from .regex import WILD_CARD_REGEX


class Entry:
    """Leaf node of a query: a single term, optionally negated.

    A term containing ``*`` is compiled into a regular expression in which
    every ``*`` is replaced by ``WILD_CARD_REGEX`` and all other text is
    matched literally.

    Args:
        query: The term text. A leading ``"not "`` (in any letter case)
            marks the entry as negated; surrounding double quotes are
            removed by ``strip_quotes``.
    """

    def __init__(self, query):
        self.not_ = False

        # Compare case-insensitively: the query is not guaranteed to have
        # been lower-cased before it reaches this point.
        if query[:4].lower() == "not ":
            self.not_ = True
            query = query[4:]

        self.query = strip_quotes(query)

        if "*" in self.query:
            # Escape the literal pieces so that characters such as "." or
            # "+" in the term are not interpreted as regex syntax.
            literal_parts = (re.escape(part) for part in self.query.split("*"))
            self.pattern = WILD_CARD_REGEX.join(literal_parts)
            self.rgx = re.compile(self.pattern)
        else:
            self.rgx = None

    def evaluate(self, doc):
        """Return whether the term occurs in *doc*.

        *doc* is either a string or an iterable of tokens. Wildcard terms
        mirror the plain ``in`` test: they are searched anywhere inside a
        string, and must match a whole token when tokens are given. The
        result is inverted for negated entries.
        """
        if self.rgx is None:
            res = self.query in doc
        elif isinstance(doc, str):
            # Substring semantics, like ``self.query in doc`` for a string.
            res = self.rgx.search(doc) is not None
        else:
            # Whole-token semantics, like membership in a set of tokens.
            res = any(self.rgx.fullmatch(token) for token in doc)

        return not res if self.not_ else res

    def __repr__(self):
        if self.not_:
            return f'NOT "{self.query}"'
        return f'"{self.query}"'


def strip_quotes(query):
    """Return *query* without its surrounding double quotes, if it has them.

    Strings that do not both start and end with ``"`` are returned
    unchanged, including the empty string.
    """
    # Slices rather than indexes, so an empty string does not raise
    # IndexError.
    if query[:1] == '"' and query[-1:] == '"':
        return query[1:-1]
    return query
44 changes: 44 additions & 0 deletions eldar/operators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@


class Binary:
    """Base node for a boolean operator with a left and a right operand."""

    def __init__(self, left, right):
        # Operands are stored exactly as given.
        self.left, self.right = left, right


class AND(Binary):
    """Conjunction node: true only when both operands match the document."""

    def evaluate(self, doc):
        # ``all`` stops at the first operand that fails, so the right side
        # is only evaluated when the left side matched.
        return all(side.evaluate(doc) for side in (self.left, self.right))

    def __repr__(self):
        return "({}) AND ({})".format(self.left, self.right)


class ANDNOT(Binary):
    """Difference node: the left operand matches and the right one does not."""

    def evaluate(self, doc):
        # The right operand is only evaluated once the left one has matched.
        if self.left.evaluate(doc):
            return not self.right.evaluate(doc)
        return False

    def __repr__(self):
        return "({}) AND NOT ({})".format(self.left, self.right)


class OR(Binary):
    """Disjunction node: true when at least one operand matches the document."""

    def evaluate(self, doc):
        # ``any`` stops at the first operand that matches, so the right side
        # is only evaluated when the left side did not match.
        return any(side.evaluate(doc) for side in (self.left, self.right))

    def __repr__(self):
        return "({}) OR ({})".format(self.left, self.right)
Loading

0 comments on commit 9a410ee

Please sign in to comment.