Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Oct 12, 2023
1 parent fccca83 commit 0076d76
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 81 deletions.
144 changes: 87 additions & 57 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Textsegment,
SortParam,
PageParam,
GLOB_PATHSTEP,
)
from share.search.search_response import (
CardsearchResponse,
Expand Down Expand Up @@ -398,7 +399,6 @@ def _cardsearch_query(
else:
raise ValueError(f'unknown filter operator {_searchfilter.operator}')
_textq_builder = self._NestedTextQueryBuilder(
inner_hits_factory=self._cardsearch_inner_hits,
relevance_matters=bool(cardsearch_cursor and not cardsearch_cursor.random_sort),
)
for _textsegment in textsegment_set:
Expand Down Expand Up @@ -434,23 +434,6 @@ def _cardsearch_query(
},
}

def _cardsearch_inner_hits(self, *, highlight_query=None) -> dict:
_highlight = {
'type': 'unified',
'fields': {'nested_text.text_value': {}},
}
if highlight_query is not None:
_highlight['highlight_query'] = highlight_query
return {
'name': str(uuid.uuid4()), # avoid inner-hit name collisions
'highlight': _highlight,
'_source': False, # _source is expensive for nested docs
'docvalue_fields': [
'nested_text.path_from_focus',
'nested_text.language_iri',
],
}

def _cardsearch_aggs(self, cardsearch_params):
_aggs = {}
if cardsearch_params.related_property_paths:
Expand Down Expand Up @@ -650,9 +633,17 @@ def _cardsearch_presence_query(self, search_filter) -> dict:
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'must': _filters}}
return {'bool': {
'min_should_match': 1,
'should': _filters,
}}

def _cardsearch_path_presence_query(self, path: tuple[str, ...]):
    '''query for "some value exists at this property path"

    a glob path (every step is `*`) matches by depth against nested_iri;
    a concrete path uses the precomputed suffuniq keyword field
    '''
    _is_glob_path = all(_step == GLOB_PATHSTEP for _step in path)
    if not _is_glob_path:
        return {'term': {
            'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True),
        }}
    return {'nested': {
        'path': 'nested_iri',
        'query': {'term': {'nested_iri.distance_from_focus': len(path)}},
    }}
Expand All @@ -664,44 +655,58 @@ def _cardsearch_iri_filter(self, search_filter) -> dict:
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'should': _filters}} # at least one match
return {'bool': {
'min_should_match': 1,
'should': _filters,
}}

def _cardsearch_path_iri_query(self, path, value_set):
    '''query for "this property path has one of these iri values"

    iris are compared by their "sufficiently unique" form; span in the
    diff interleaved removed lines -- this is the coherent added-side
    implementation
    '''
    _suffuniq_values = [
        get_sufficiently_unique_iri(_iri)
        for _iri in value_set
    ]
    if all(_pathstep == GLOB_PATHSTEP for _pathstep in path):
        # glob path: match nested_iri docs on both depth and iri value
        return {'nested': {
            'path': 'nested_iri',
            'query': {'bool': {
                'must': [  # both
                    {'term': {'nested_iri.distance_from_focus': len(path)}},
                    {'terms': {'nested_iri.flat_iri_values_suffuniq': _suffuniq_values}},
                ],
            }},
        }}
    # without a glob-path, can use the flattened keyword field
    return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}}

def _cardsearch_date_filter(self, search_filter):
    '''wrap the per-filter date clauses in a nested query on nested_date'''
    _date_filters = list(self._iter_nested_date_filters(search_filter))
    return {'nested': {
        'path': 'nested_date',
        'query': {'bool': {'filter': _date_filters}},
    }}

def _iter_nested_date_filters(self, search_filter) -> 'typing.Iterator[dict]':
    '''generate elasticsearch filter clauses for one date search-filter

    yields a property-path filter, then one or more range clauses
    depending on the filter operator; raises ValueError for operators
    that are not date operators
    (annotation fixed: this is a generator of dicts, not a dict)
    '''
    # filter by requested paths
    yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date')
    # filter by requested value/operator
    if search_filter.operator == SearchFilter.FilterOperator.BEFORE:
        _value = min(search_filter.value_set)  # rely on string-comparable isoformat
        yield {'range': {'nested_date.date_value': {
            'lt': _daterange_value_and_format(_value)
        }}}
    elif search_filter.operator == SearchFilter.FilterOperator.AFTER:
        _value = max(search_filter.value_set)  # rely on string-comparable isoformat
        yield {'range': {'nested_date.date_value': {
            'gt': _daterange_value_and_format(_value)
        }}}
    elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE:
        for _value in search_filter.value_set:
            _filtervalue = _daterange_value_and_format(_value)
            yield {'range': {'nested_date.date_value': {
                'gte': _filtervalue,
                'lte': _filtervalue,
            }}}
    else:
        raise ValueError(f'invalid date filter operator (got {search_filter.operator})')

def _cardsearch_sort(self, sort_list: tuple[SortParam]):
if not sort_list:
Expand Down Expand Up @@ -860,9 +865,8 @@ def fuzzy_text_should_query(self, text: str):
}}

class _NestedTextQueryBuilder(_SimpleTextQueryBuilder):
    '''text-query builder targeting the nested_text.text_value field'''

    def __init__(self, **kwargs):
        # always query the nested text field; remaining options pass through
        # (stale removed-diff line referencing inner_hits_factory dropped)
        super().__init__('nested_text.text_value', **kwargs)

def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]:
return {
Expand All @@ -873,30 +877,35 @@ def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]:
for _boolkey, _queries in super().textsegment_boolparts(textsegment).items()
}

def _make_nested_query(self, textsegment, query):
    '''wrap a text query in a nested query filtered to the textsegment's paths

    span in the diff interleaved the removed `_textsegment_filter` helper;
    this is the coherent added-side implementation using the shared
    `_pathset_as_nestedvalue_filter` helper
    '''
    _nested_q = {'nested': {
        'path': self._nested_path,
        'query': {'bool': {
            'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'),
            'must': query,
        }},
    }}
    if self._relevance_matters:
        # request inner hits (for highlighting) only when relevance is used
        _nested_q['nested']['inner_hits'] = self._inner_hits()
    return _nested_q

def _inner_hits(self, *, highlight_query=None) -> dict:
_highlight = {
'type': 'unified',
'fields': {'nested_text.text_value': {}},
}
if highlight_query is not None:
_highlight['highlight_query'] = highlight_query
return {
'name': str(uuid.uuid4()), # avoid inner-hit name collisions
'highlight': _highlight,
'_source': False, # _source is expensive for nested docs
'docvalue_fields': [
'nested_text.path_from_focus',
'nested_text.language_iri',
],
}


###
# module-local utils
Expand Down Expand Up @@ -942,6 +951,27 @@ def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str:
return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}'


def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str):
    '''build a filter matching any of the given property paths on a nested field

    glob paths (every step `*`) match by depth; concrete paths match by
    suffuniq keyword; when both kinds are present, either may match
    '''
    _glob_depths = []
    _keyword_paths = []
    for _path in propertypath_set:
        _is_glob = all(_step == GLOB_PATHSTEP for _step in _path)
        if _is_glob:
            _glob_depths.append(len(_path))
        else:
            _keyword_paths.append(iri_path_as_keyword(_path, suffuniq=True))
    _glob_clause = {'terms': {f'{nested_path}.distance_from_focus': _glob_depths}}
    _keyword_clause = {'terms': {f'{nested_path}.suffuniq_path_from_focus': _keyword_paths}}
    if _keyword_paths and _glob_depths:
        return {'bool': {
            'min_should_match': 1,
            'should': [_glob_clause, _keyword_clause],
        }}
    if _glob_depths:
        return _glob_clause
    return _keyword_clause


@dataclasses.dataclass
class _SimpleCursor:
start_index: int
Expand Down
66 changes: 42 additions & 24 deletions share/search/search_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,37 @@


###
# constants for use in query param parsing

# special characters in "...SearchText" values
NEGATE_WORD_OR_PHRASE = '-'
DOUBLE_QUOTATION_MARK = '"'

# optional prefix for "sort" values (requests descending order)
DESCENDING_SORT_PREFIX = '-'

# bounds/default for "page[size]" values
DEFAULT_PAGE_SIZE = 13
MAX_PAGE_SIZE = 101

# delimiter between steps in a serialized property path, e.g. "foo.bar.baz"
PROPERTYPATH_DELIMITER = '.'

# special path-step that matches all properties
GLOB_PATHSTEP = '*'


###
# dataclasses for parsed search-api query parameters


@dataclasses.dataclass(frozen=True)
class Textsegment:
    '''a segment of search text, with flags parsed from its surroundings'''
    text: str
    is_fuzzy: bool = True
    is_negated: bool = False
    is_openended: bool = False
    # property paths to search on; defaults to the depth-1 glob path.
    # bugfix: `frozenset((GLOB_PATHSTEP,))` was a frozenset of *strings*
    # ({'*'}), not of path-tuples as the annotation declares -- the default
    # must contain the one-step glob path tuple
    propertypath_set: frozenset[tuple[str, ...]] = frozenset([(GLOB_PATHSTEP,)])

def __post_init__(self):
if self.is_negated and self.is_fuzzy:
Expand All @@ -61,24 +70,23 @@ def words(self):

@classmethod
def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str):
    '''collect a frozenset of Textsegments from all queryparams in the family'''
    return frozenset(cls.iter_from_queryparam_family(queryparams, queryparam_family))

@classmethod
def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str):
    '''yield Textsegments parsed from each queryparam in the given family'''
    for (_param_name, _param_value) in queryparams.get(queryparam_family, ()):
        yield from cls.iter_from_searchtext_param(_param_name, _param_value)

@classmethod
def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str):
    '''yield Textsegments parsed from one searchtext queryparam

    NOTE(review): as written, yields nothing when the param name has no
    bracketed propertypath -- presumably a default path should apply;
    confirm against callers (this commit is marked "wip")
    '''
    if param_name.bracketed_names:
        yield from cls.iter_from_text(
            param_value,
            propertypath_set=_parse_propertypath_set(param_name.bracketed_names[0]),
        )

@classmethod
def _split_str(cls, text: str) -> typing.Iterable['Textsegment']:
def iter_from_text(cls, text: str, **textsegment_kwargs) -> typing.Iterable['Textsegment']:
'''parse search text into words and quoted phrases
'''
_in_quotes = False
Expand Down Expand Up @@ -184,9 +192,9 @@ def is_iri_operator(self):
def is_valueless_operator(self):
return self in (self.IS_PRESENT, self.IS_ABSENT)

# dataclass fields (reordered so the defaulted path-set comes last).
# bugfix: `frozenset((GLOB_PATHSTEP,))` was a frozenset of *strings*
# ({'*'}), not of path-tuples as annotated -- default must contain the
# one-step glob path tuple
operator: FilterOperator
value_set: frozenset[str]
propertypath_set: frozenset[tuple[str, ...]] = frozenset([(GLOB_PATHSTEP,)])
original_param_name: typing.Optional[str] = None
original_param_value: typing.Optional[str] = None

Expand Down Expand Up @@ -243,9 +251,9 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str):
else:
_value_list.append(_iri)
return cls(
propertypath_set=_propertypath_set,
value_set=frozenset(_value_list),
operator=_operator,
propertypath_set=_propertypath_set,
original_param_name=str(param_name),
original_param_value=param_value,
)
Expand Down Expand Up @@ -390,7 +398,7 @@ def from_queryparams(queryparams: QueryparamDict) -> 'ValuesearchParams':
raise ValueError('TODO: 400 valueSearchPropertyPath required')
return ValuesearchParams(
**CardsearchParams.parse_cardsearch_queryparams(queryparams),
valuesearch_propertypath_set=_parse_propertypath_set(_raw_propertypath),
valuesearch_propertypath_set=_parse_propertypath_set(_raw_propertypath, allow_globs=False),
valuesearch_text=_valuesearch_text,
valuesearch_textsegment_set=Textsegment.from_queryparam_family(queryparams, 'valueSearchText'),
valuesearch_filter_set=SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'),
Expand Down Expand Up @@ -450,14 +458,11 @@ def _get_single_value(
return _singlevalue


def _parse_propertypath_set(serialized_path_set: str) -> frozenset[tuple[str, ...]]:
def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> frozenset[tuple[str, ...]]:
if FeatureFlag.objects.flag_is_up(FeatureFlag.PERIODIC_PROPERTYPATH):
# comma-delimited set of dot-delimited paths
return frozenset(
tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in _path.split(PROPERTYPATH_DELIMITER)
)
_parse_propertypath(_path, allow_globs=allow_globs)
for _path in split_queryparam_value(serialized_path_set)
)
# single comma-delimited path
Expand All @@ -468,6 +473,19 @@ def _parse_propertypath_set(serialized_path_set: str) -> frozenset[tuple[str, ..
return frozenset([_propertypath])


def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str, ...]:
    '''parse one dot-delimited property path into a tuple of iris

    each step is resolved as an osfmap label (kept as-is when unknown);
    glob steps (`*`) are only valid when allowed, and must make up the
    whole path
    '''
    _steps = serialized_path.split(PROPERTYPATH_DELIMITER)
    _path = tuple(
        osfmap_labeler.iri_for_label(_step, default=_step)
        for _step in _steps
    )
    if GLOB_PATHSTEP not in _path:
        return _path
    if not allow_globs:
        raise ValueError(f'no * allowed (got {serialized_path})')
    if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path):
        raise ValueError(f'path must be all * or no * (got {serialized_path})')
    return _path


def _get_related_property_paths(filter_set) -> tuple[tuple[str]]:
# hard-coded for osf.io search pages, static list per type
# TODO: replace with some dynamism, maybe a 'significant_terms' aggregation
Expand Down

0 comments on commit 0076d76

Please sign in to comment.