Skip to content

Commit

Permalink
Merge pull request #87 from bento-platform/search-improvements
Browse files Browse the repository at this point in the history
Search: #like, #ilike, code docs, major dependency upgrades
  • Loading branch information
davidlougheed authored Feb 17, 2023
2 parents 16a1666 + 3cdf756 commit c99c7a5
Show file tree
Hide file tree
Showing 12 changed files with 609 additions and 326 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
name: Set up Python
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
contents: read

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: actions/setup-python@v4
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ jobs:
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v1
- uses: actions/setup-python@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
name: Set up Python
with:
python-version: ${{ matrix.python-version }}
Expand Down
9 changes: 6 additions & 3 deletions bento_lib/events/_event_bus.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def __init__(self, allow_fake: bool = False, **kwargs):
raise e
logger.warning(f"Starting event bus in 'fake' mode (tried connection data: {connection_data})")

self._ps: Optional[redis.PubSub] = None
self._ps: Optional[redis.client.PubSub] = None

self._ps_handlers: dict[str, Callable[[dict], None]] = {}
self._event_thread: Optional[redis.PubSubWorkerThread] = None
self._event_thread: Optional[redis.client.PubSubWorkerThread] = None

self._service_event_types: Dict[str, dict] = {}
self._data_type_event_types: Dict[str, dict] = {}
Expand Down Expand Up @@ -101,7 +101,10 @@ def start_event_loop(self) -> None:
self._logger.debug("Starting EventBus event loop")

self._ps = self._rc.pubsub()
self._ps.psubscribe(**self._ps_handlers)

if self._ps_handlers: # Only try to subscribe if we have any registered handlers
self._ps.psubscribe(**self._ps_handlers)

self._event_thread = self._ps.run_in_thread(sleep_time=0.001, daemon=True)

def stop_event_loop(self) -> None:
Expand Down
2 changes: 1 addition & 1 deletion bento_lib/package.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = bento_lib
version = 5.4.0
version = 6.0.0a1
authors = David Lougheed, Paul Pillot
author_emails = [email protected], [email protected]
125 changes: 100 additions & 25 deletions bento_lib/search/data_structure.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import re

import jsonschema

from functools import partial
from itertools import chain, product, starmap
from operator import and_, or_, not_, lt, le, eq, gt, ge, contains, is_not
from typing import Callable, Dict, Iterable, Optional, Tuple, Union
from typing import Callable, Dict, List, Iterable, Optional, Tuple, Union

from . import queries as q
from ._types import JSONSchema
Expand All @@ -20,7 +22,7 @@
ArrayLengthData = Tuple[str, int, Tuple["ArrayLengthData", ...]]


def _icontains(lhs: str, rhs: str):
def _icontains(lhs: str, rhs: str) -> bool:
"""
Same as the "contains" operator, except with case-folded (i.e. case
insensitive) arguments.
Expand All @@ -31,7 +33,7 @@ def _icontains(lhs: str, rhs: str):
return contains(lhs.casefold(), rhs.casefold())


def _in(lhs: Union[str, int, float], rhs: QueryableStructure):
def _in(lhs: Union[str, int, float], rhs: QueryableStructure) -> bool:
"""
Same as `contains`, except order of arguments is inverted and second
argument is a set.
Expand All @@ -42,32 +44,84 @@ def _in(lhs: Union[str, int, float], rhs: QueryableStructure):
return contains(rhs, lhs)


def _i_starts_with(lhs: str, rhs: str):
def _i_starts_with(lhs: str, rhs: str) -> bool:
"""
Checks whether a string starts with a particular prefix, in a case-insensitive fashion.
:param lhs: The full string to assess.
:param rhs: The prefix to test against LHS.
:return: Whether the string starts with the prefix.
"""
if not isinstance(lhs, str) or not isinstance(rhs, str):
raise TypeError("#isw can only be used with strings")
raise TypeError(f"{q.FUNCTION_ISW} can only be used with strings")
return lhs.casefold().startswith(rhs.casefold())


def _i_ends_with(lhs: str, rhs: str):
def _i_ends_with(lhs: str, rhs: str) -> bool:
"""
Checks whether a string ends with a particular suffix, in a case-insensitive fashion.
:param lhs: The full string to assess.
:param rhs: The suffix to test against LHS.
:return: Whether the string ends with the suffix.
"""
if not isinstance(lhs, str) or not isinstance(rhs, str):
raise TypeError("#iew can only be used with strings")
raise TypeError(f"{q.FUNCTION_IEW} can only be used with strings")
return lhs.casefold().endswith(rhs.casefold())


# See, e.g., https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
REGEX_CHARS_TO_ESCAPE = frozenset({"[", "]", "(", ")", "{", "}", "\\", ".", "^", "$", "*", "+", "-", "?", "|"})


def regex_from_like_pattern(pattern: str, case_insensitive: bool) -> re.Pattern:
    r"""
    Converts an SQL-style match pattern with %/_ wildcards into a Python Regex object.
    A backslash escapes the character that follows it, so \% and \_ match literal
    % and _ characters, and \\ matches a literal backslash.
    :param pattern: The SQL-style match pattern to convert.
    :param case_insensitive: Whether the generated Regex should be case-insensitive.
    :return: The converted Regex object.
    """

    # - Replace % with (.*) if % is not preceded by a \
    # - Wrap with ^$ to replicate whole-string behaviour
    # - Escape any special Regex characters

    regex_form: List[str] = ["^"]
    escape_mode: bool = False

    for char in pattern:
        # An *unescaped* backslash puts us into escape mode, so the next character is taken literally.
        # (Bug fix: previously an escaped backslash re-armed escape mode instead of emitting a literal
        # backslash, so a pattern like \\% could never match "a backslash followed by anything".)
        if char == "\\" and not escape_mode:
            escape_mode = True
            continue

        if char == "%":  # Matches any number of characters
            # If we're in escape mode, append the literal %. Otherwise, replace it with a wildcard pattern.
            regex_form.append("%" if escape_mode else "(.*)")
        elif char == "_":  # Match a single character
            regex_form.append("_" if escape_mode else ".")
        elif char in REGEX_CHARS_TO_ESCAPE:
            # Escape special Regex characters with a backslash while building pattern.
            # This branch also handles an escaped literal backslash (char == "\\" with escape_mode on).
            regex_form.append(rf"\{char}")
        else:
            regex_form.append(char)  # Unmodified if not special

        escape_mode = False  # Escape mode only applies to the single character following the backslash

    # Note: a trailing unpaired backslash is silently dropped (same as the original behaviour).

    regex_form.append("$")

    return re.compile("".join(regex_form), *((re.IGNORECASE,) if case_insensitive else ()))


def _like_op(case_insensitive: bool):
    """
    Builds a binary operator implementing the SQL-style #like / #ilike match functions.
    :param case_insensitive: Whether the returned operator matches case-insensitively (#ilike vs. #like).
    :return: A (lhs, rhs) -> bool function, where rhs is an SQL-style pattern matched against lhs.
    """

    # Report the correct function name in type errors, depending on which variant this operator
    # implements (bug fix: previously, #ilike type errors were always reported as coming from #like).
    fn_id = q.FUNCTION_ILIKE if case_insensitive else q.FUNCTION_LIKE

    def like_inner(lhs, rhs) -> bool:
        if not isinstance(lhs, str) or not isinstance(rhs, str):
            raise TypeError(f"{fn_id} can only be used with strings")

        return regex_from_like_pattern(rhs, case_insensitive).match(lhs) is not None

    return like_inner


def _validate_data_structure_against_schema(
data_structure: QueryableStructure, schema: JSONSchema, secure_errors: bool = True):
data_structure: QueryableStructure, schema: JSONSchema, secure_errors: bool = True) -> None:
"""
Validates a queryable data structure of some type against a JSON schema. This is an important validation step,
because (assuming the schema is correct) it allows methods to make more assumptions about the integrity of the
Expand Down Expand Up @@ -100,7 +154,7 @@ def _validate_data_structure_against_schema(
f"{errors_str}")


def _validate_not_wc(e: q.AST):
def _validate_not_wc(e: q.AST) -> None:
"""
The #_wc (wildcard) expression function is a helper for converting the queries into the Postgres IR. If we encounter
this function in a query being evaluated against a data structure, it's meaningless and should raise an error.
Expand Down Expand Up @@ -166,7 +220,7 @@ def evaluate(
check_permissions: bool = True,
secure_errors: bool = True,
):
# The validate flag is used to avoid redundantly validating the integrity of child data structures
# The 'validate' flag is used to avoid redundantly validating the integrity of child data structures
_validate_data_structure_against_schema(data_structure, schema, secure_errors=secure_errors)
return evaluate_no_validate(ast, data_structure, schema, index_combination, internal, resolve_checks,
check_permissions)
Expand Down Expand Up @@ -328,9 +382,15 @@ def _binary_op(op: BBOperator)\
is_and = op == and_
is_or = op == or_

def uncurried_binary_op(args: q.Args, ds: QueryableStructure, schema: JSONSchema,
ic: Optional[IndexCombination], internal: bool, resolve_checks: bool,
check_permissions: bool) -> bool:
def uncurried_binary_op(
args: q.Args,
ds: QueryableStructure,
schema: JSONSchema,
ic: Optional[IndexCombination],
internal: bool,
resolve_checks: bool,
check_permissions: bool
) -> bool:
# TODO: Standardize type safety / behaviour!!!

# Evaluate both sides of the binary expression. If there's a type error while trying to use a Python built-in,
Expand Down Expand Up @@ -379,7 +439,7 @@ def _resolve_checks(resolve_value: str, schema: JSONSchema):


def _get_child_resolve_array_lengths(
new_resolve: Tuple[q.Literal, ...],
new_resolve: Tuple[q.AST, ...],
resolving_ds: list,
item_schema: JSONSchema,
new_path: str,
Expand All @@ -401,7 +461,7 @@ def _get_child_resolve_array_lengths(


def _resolve_array_lengths(
resolve: Tuple[q.Literal, ...],
resolve: Tuple[q.AST, ...],
resolving_ds: QueryableStructure,
schema: JSONSchema,
path: str = "_root",
Expand Down Expand Up @@ -435,10 +495,11 @@ def _resolve_array_lengths(

# The current data structure is an array, so return its length and recurse on its (potential) child arrays.
if resolve[0].value == "[item]":
return (path,
len(resolving_ds),
tuple(_get_child_resolve_array_lengths(resolve[1:], resolving_ds, schema["items"], new_path,
resolve_checks)))
return (
path,
len(resolving_ds),
tuple(
_get_child_resolve_array_lengths(resolve[1:], resolving_ds, schema["items"], new_path, resolve_checks)))

# Otherwise, it's an object, so keep traversing without doing anything
return _resolve_array_lengths(resolve[1:], resolving_ds[resolve_value], schema["properties"][resolve_value],
Expand Down Expand Up @@ -477,9 +538,15 @@ def _resolve_properties_and_check(
return r_schema.get("search", {})


def _resolve(resolve: Tuple[q.Literal, ...], resolving_ds: QueryableStructure, _schema: JSONSchema,
index_combination: Optional[IndexCombination], _internal, _resolve_checks, _check_permissions) \
-> QueryableStructure:
def _resolve(
resolve: Tuple[q.Literal, ...],
resolving_ds: QueryableStructure,
_schema: JSONSchema,
index_combination: Optional[IndexCombination],
_internal: bool,
_resolve_checks: bool,
_check_permissions: bool,
) -> QueryableStructure:
"""
Resolves / evaluates a path (either object or array) into a value. Assumes the data structure has already been
checked against its schema.
Expand All @@ -500,9 +567,15 @@ def _resolve(resolve: Tuple[q.Literal, ...], resolving_ds: QueryableStructure, _
return resolving_ds


def _list(literals: Tuple[q.Literal, ...], resolving_ds: QueryableStructure, _schema: JSONSchema,
index_combination: Optional[IndexCombination], _internal, _resolve_checks, _check_permissions) \
-> QueryableStructure:
def _list(
literals: Tuple[q.Literal, ...],
_resolving_ds: QueryableStructure,
_schema: JSONSchema,
_index_combination: Optional[IndexCombination],
_internal: bool,
_resolve_checks: bool,
_check_permissions: bool,
) -> QueryableStructure:
"""
This function is to be used in conjunction with the #in operator to check
for matches in a set of literals. (e.g. individual.karyotypic_sex in {"XX", "X0", "XXX"})
Expand Down Expand Up @@ -533,6 +606,8 @@ def _list(literals: Tuple[q.Literal, ...], resolving_ds: QueryableStructure, _sc

q.FUNCTION_ISW: _binary_op(_i_starts_with),
q.FUNCTION_IEW: _binary_op(_i_ends_with),
q.FUNCTION_LIKE: _binary_op(_like_op(case_insensitive=False)),
q.FUNCTION_ILIKE: _binary_op(_like_op(case_insensitive=True)),

q.FUNCTION_RESOLVE: _resolve,
q.FUNCTION_LIST: _list,
Expand Down
7 changes: 7 additions & 0 deletions bento_lib/search/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

"SEARCH_OP_ISW",
"SEARCH_OP_IEW",
"SEARCH_OP_LIKE",
"SEARCH_OP_ILIKE",

"SEARCH_OPERATIONS",
]
Expand All @@ -30,6 +32,8 @@

SEARCH_OP_ISW = "isw"
SEARCH_OP_IEW = "iew"
SEARCH_OP_LIKE = "like"
SEARCH_OP_ILIKE = "ilike"

SEARCH_OPERATIONS = (
SEARCH_OP_EQ,
Expand All @@ -42,4 +46,7 @@
SEARCH_OP_ICO,
SEARCH_OP_ISW,
SEARCH_OP_IEW,
SEARCH_OP_LIKE,
SEARCH_OP_ILIKE,
)
Loading

0 comments on commit c99c7a5

Please sign in to comment.