Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Oct 12, 2023
1 parent fccca83 commit 0076d76
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 81 deletions.
144 changes: 87 additions & 57 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Textsegment,
SortParam,
PageParam,
GLOB_PATHSTEP,
)
from share.search.search_response import (
CardsearchResponse,
Expand Down Expand Up @@ -398,7 +399,6 @@ def _cardsearch_query(
else:
raise ValueError(f'unknown filter operator {_searchfilter.operator}')
_textq_builder = self._NestedTextQueryBuilder(
inner_hits_factory=self._cardsearch_inner_hits,
relevance_matters=bool(cardsearch_cursor and not cardsearch_cursor.random_sort),
)
for _textsegment in textsegment_set:
Expand Down Expand Up @@ -434,23 +434,6 @@ def _cardsearch_query(
},
}

def _cardsearch_inner_hits(self, *, highlight_query=None) -> dict:
_highlight = {
'type': 'unified',
'fields': {'nested_text.text_value': {}},
}
if highlight_query is not None:
_highlight['highlight_query'] = highlight_query
return {
'name': str(uuid.uuid4()), # avoid inner-hit name collisions
'highlight': _highlight,
'_source': False, # _source is expensive for nested docs
'docvalue_fields': [
'nested_text.path_from_focus',
'nested_text.language_iri',
],
}

def _cardsearch_aggs(self, cardsearch_params):
_aggs = {}
if cardsearch_params.related_property_paths:
Expand Down Expand Up @@ -650,9 +633,17 @@ def _cardsearch_presence_query(self, search_filter) -> dict:
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'must': _filters}}
return {'bool': {
'min_should_match': 1,
'should': _filters,
}}

def _cardsearch_path_presence_query(self, path: tuple[str, ...]):
    '''query for "some value exists at this property path"

    a glob path (every step is `*`) matches by depth against nested_iri;
    a concrete path uses the precomputed suffuniq keyword field
    '''
    _is_glob_path = all(_step == GLOB_PATHSTEP for _step in path)
    if not _is_glob_path:
        return {'term': {
            'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True),
        }}
    return {'nested': {
        'path': 'nested_iri',
        'query': {'term': {'nested_iri.distance_from_focus': len(path)}},
    }}
Expand All @@ -664,44 +655,58 @@ def _cardsearch_iri_filter(self, search_filter) -> dict:
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'should': _filters}} # at least one match
return {'bool': {
'min_should_match': 1,
'should': _filters,
}}

def _cardsearch_path_iri_query(self, path, value_set):
    '''query for "this property path has one of these iri values"

    iris are compared by their "sufficiently unique" form; span in the
    diff interleaved removed lines -- this is the coherent added-side
    implementation
    '''
    _suffuniq_values = [
        get_sufficiently_unique_iri(_iri)
        for _iri in value_set
    ]
    if all(_pathstep == GLOB_PATHSTEP for _pathstep in path):
        # glob path: match nested_iri docs on both depth and iri value
        return {'nested': {
            'path': 'nested_iri',
            'query': {'bool': {
                'must': [  # both
                    {'term': {'nested_iri.distance_from_focus': len(path)}},
                    {'terms': {'nested_iri.flat_iri_values_suffuniq': _suffuniq_values}},
                ],
            }},
        }}
    # without a glob-path, can use the flattened keyword field
    return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}}

def _cardsearch_date_filter(self, search_filter):
    '''wrap the per-filter date clauses in a nested query on nested_date'''
    _date_filters = list(self._iter_nested_date_filters(search_filter))
    return {'nested': {
        'path': 'nested_date',
        'query': {'bool': {'filter': _date_filters}},
    }}

def _iter_nested_date_filters(self, search_filter) -> 'typing.Iterator[dict]':
    '''generate elasticsearch filter clauses for one date search-filter

    yields a property-path filter, then one or more range clauses
    depending on the filter operator; raises ValueError for operators
    that are not date operators
    (annotation fixed: this is a generator of dicts, not a dict)
    '''
    # filter by requested paths
    yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date')
    # filter by requested value/operator
    if search_filter.operator == SearchFilter.FilterOperator.BEFORE:
        _value = min(search_filter.value_set)  # rely on string-comparable isoformat
        yield {'range': {'nested_date.date_value': {
            'lt': _daterange_value_and_format(_value)
        }}}
    elif search_filter.operator == SearchFilter.FilterOperator.AFTER:
        _value = max(search_filter.value_set)  # rely on string-comparable isoformat
        yield {'range': {'nested_date.date_value': {
            'gt': _daterange_value_and_format(_value)
        }}}
    elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE:
        for _value in search_filter.value_set:
            _filtervalue = _daterange_value_and_format(_value)
            yield {'range': {'nested_date.date_value': {
                'gte': _filtervalue,
                'lte': _filtervalue,
            }}}
    else:
        raise ValueError(f'invalid date filter operator (got {search_filter.operator})')

def _cardsearch_sort(self, sort_list: tuple[SortParam]):
if not sort_list:
Expand Down Expand Up @@ -860,9 +865,8 @@ def fuzzy_text_should_query(self, text: str):
}}

class _NestedTextQueryBuilder(_SimpleTextQueryBuilder):
    '''text-query builder targeting the nested_text.text_value field'''

    def __init__(self, **kwargs):
        # always query the nested text field; remaining options pass through
        # (stale removed-diff line referencing inner_hits_factory dropped)
        super().__init__('nested_text.text_value', **kwargs)

def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]:
return {
Expand All @@ -873,30 +877,35 @@ def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]:
for _boolkey, _queries in super().textsegment_boolparts(textsegment).items()
}

def _make_nested_query(self, textsegment, query):
    '''wrap a text query in a nested query filtered to the textsegment's paths

    span in the diff interleaved the removed `_textsegment_filter` helper;
    this is the coherent added-side implementation using the shared
    `_pathset_as_nestedvalue_filter` helper
    '''
    _nested_q = {'nested': {
        'path': self._nested_path,
        'query': {'bool': {
            'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'),
            'must': query,
        }},
    }}
    if self._relevance_matters:
        # request inner hits (for highlighting) only when relevance is used
        _nested_q['nested']['inner_hits'] = self._inner_hits()
    return _nested_q

def _inner_hits(self, *, highlight_query=None) -> dict:
_highlight = {
'type': 'unified',
'fields': {'nested_text.text_value': {}},
}
if highlight_query is not None:
_highlight['highlight_query'] = highlight_query
return {
'name': str(uuid.uuid4()), # avoid inner-hit name collisions
'highlight': _highlight,
'_source': False, # _source is expensive for nested docs
'docvalue_fields': [
'nested_text.path_from_focus',
'nested_text.language_iri',
],
}


###
# module-local utils
Expand Down Expand Up @@ -942,6 +951,27 @@ def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str:
return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}'


def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str):
    '''build a filter matching any of the given property paths on a nested field

    glob paths (every step `*`) match by depth; concrete paths match by
    suffuniq keyword; when both kinds are present, either may match
    '''
    _glob_depths = []
    _keyword_paths = []
    for _path in propertypath_set:
        _is_glob = all(_step == GLOB_PATHSTEP for _step in _path)
        if _is_glob:
            _glob_depths.append(len(_path))
        else:
            _keyword_paths.append(iri_path_as_keyword(_path, suffuniq=True))
    _glob_clause = {'terms': {f'{nested_path}.distance_from_focus': _glob_depths}}
    _keyword_clause = {'terms': {f'{nested_path}.suffuniq_path_from_focus': _keyword_paths}}
    if _keyword_paths and _glob_depths:
        return {'bool': {
            'min_should_match': 1,
            'should': [_glob_clause, _keyword_clause],
        }}
    if _glob_depths:
        return _glob_clause
    return _keyword_clause


@dataclasses.dataclass
class _SimpleCursor:
start_index: int
Expand Down
66 changes: 42 additions & 24 deletions share/search/search_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,37 @@


###
# constants for use in query param parsing

# special characters in "...SearchText" values
NEGATE_WORD_OR_PHRASE = '-'
DOUBLE_QUOTATION_MARK = '"'

# optional prefix for "sort" values (requests descending order)
DESCENDING_SORT_PREFIX = '-'

# bounds/default for "page[size]" values
DEFAULT_PAGE_SIZE = 13
MAX_PAGE_SIZE = 101

# delimiter between steps in a serialized property path, e.g. "foo.bar.baz"
PROPERTYPATH_DELIMITER = '.'

# special path-step that matches all properties
GLOB_PATHSTEP = '*'


###
# dataclasses for parsed search-api query parameters


@dataclasses.dataclass(frozen=True)
class Textsegment:
    '''a segment of search text, with flags parsed from its surroundings'''
    text: str
    is_fuzzy: bool = True
    is_negated: bool = False
    is_openended: bool = False
    # property paths to search on; defaults to the depth-1 glob path.
    # bugfix: `frozenset((GLOB_PATHSTEP,))` was a frozenset of *strings*
    # ({'*'}), not of path-tuples as the annotation declares -- the default
    # must contain the one-step glob path tuple
    propertypath_set: frozenset[tuple[str, ...]] = frozenset([(GLOB_PATHSTEP,)])

def __post_init__(self):
if self.is_negated and self.is_fuzzy:
Expand All @@ -61,24 +70,23 @@ def words(self):

@classmethod
def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str):
    '''collect a frozenset of Textsegments from all queryparams in the family'''
    return frozenset(cls.iter_from_queryparam_family(queryparams, queryparam_family))

@classmethod
def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str):
    '''yield Textsegments parsed from each queryparam in the given family'''
    for (_param_name, _param_value) in queryparams.get(queryparam_family, ()):
        yield from cls.iter_from_searchtext_param(_param_name, _param_value)

@classmethod
def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str):
    '''yield Textsegments parsed from one searchtext queryparam

    NOTE(review): as written, yields nothing when the param name has no
    bracketed propertypath -- presumably a default path should apply;
    confirm against callers (this commit is marked "wip")
    '''
    if param_name.bracketed_names:
        yield from cls.iter_from_text(
            param_value,
            propertypath_set=_parse_propertypath_set(param_name.bracketed_names[0]),
        )

@classmethod
def _split_str(cls, text: str) -> typing.Iterable['Textsegment']:
def iter_from_text(cls, text: str, **textsegment_kwargs) -> typing.Iterable['Textsegment']:
'''parse search text into words and quoted phrases
'''
_in_quotes = False
Expand Down Expand Up @@ -184,9 +192,9 @@ def is_iri_operator(self):
def is_valueless_operator(self):
return self in (self.IS_PRESENT, self.IS_ABSENT)

# dataclass fields (reordered so the defaulted path-set comes last).
# bugfix: `frozenset((GLOB_PATHSTEP,))` was a frozenset of *strings*
# ({'*'}), not of path-tuples as annotated -- default must contain the
# one-step glob path tuple
operator: FilterOperator
value_set: frozenset[str]
propertypath_set: frozenset[tuple[str, ...]] = frozenset([(GLOB_PATHSTEP,)])
original_param_name: typing.Optional[str] = None
original_param_value: typing.Optional[str] = None

Expand Down Expand Up @@ -243,9 +251,9 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str):
else:
_value_list.append(_iri)
return cls(
propertypath_set=_propertypath_set,
value_set=frozenset(_value_list),
operator=_operator,
propertypath_set=_propertypath_set,
original_param_name=str(param_name),
original_param_value=param_value,
)
Expand Down Expand Up @@ -390,7 +398,7 @@ def from_queryparams(queryparams: QueryparamDict) -> 'ValuesearchParams':
raise ValueError('TODO: 400 valueSearchPropertyPath required')
return ValuesearchParams(
**CardsearchParams.parse_cardsearch_queryparams(queryparams),
valuesearch_propertypath_set=_parse_propertypath_set(_raw_propertypath),
valuesearch_propertypath_set=_parse_propertypath_set(_raw_propertypath, allow_globs=False),
valuesearch_text=_valuesearch_text,
valuesearch_textsegment_set=Textsegment.from_queryparam_family(queryparams, 'valueSearchText'),
valuesearch_filter_set=SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'),
Expand Down Expand Up @@ -450,14 +458,11 @@ def _get_single_value(
return _singlevalue


def _parse_propertypath_set(serialized_path_set: str) -> frozenset[tuple[str, ...]]:
def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> frozenset[tuple[str, ...]]:
if FeatureFlag.objects.flag_is_up(FeatureFlag.PERIODIC_PROPERTYPATH):
# comma-delimited set of dot-delimited paths
return frozenset(
tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in _path.split(PROPERTYPATH_DELIMITER)
)
_parse_propertypath(_path, allow_globs=allow_globs)
for _path in split_queryparam_value(serialized_path_set)
)
# single comma-delimited path
Expand All @@ -468,6 +473,19 @@ def _parse_propertypath_set(serialized_path_set: str) -> frozenset[tuple[str, ..
return frozenset([_propertypath])


def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> tuple[str, ...]:
    '''parse one dot-delimited property path into a tuple of iris

    each step is resolved as an osfmap label (kept as-is when unknown);
    glob steps (`*`) are only valid when allowed, and must make up the
    whole path
    '''
    _steps = serialized_path.split(PROPERTYPATH_DELIMITER)
    _path = tuple(
        osfmap_labeler.iri_for_label(_step, default=_step)
        for _step in _steps
    )
    if GLOB_PATHSTEP not in _path:
        return _path
    if not allow_globs:
        raise ValueError(f'no * allowed (got {serialized_path})')
    if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path):
        raise ValueError(f'path must be all * or no * (got {serialized_path})')
    return _path


def _get_related_property_paths(filter_set) -> tuple[tuple[str]]:
# hard-coded for osf.io search pages, static list per type
# TODO: replace with some dynamism, maybe a 'significant_terms' aggregation
Expand Down

0 comments on commit 0076d76

Please sign in to comment.