From 586fd5df9295e67fb6349d3011491b93a637dda0 Mon Sep 17 00:00:00 2001
From: Vinicius
Date: Wed, 17 Apr 2024 19:15:24 -0300
Subject: [PATCH] feat: improve search with ngram and snowball

---
 bd_api/apps/api/v1/search_engines.py | 48 +++++++++++++++++++++-------
 bd_api/apps/api/v1/search_indexes.py |  2 +-
 bd_api/apps/api/v1/search_views.py   | 20 ++++++++++--
 3 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/bd_api/apps/api/v1/search_engines.py b/bd_api/apps/api/v1/search_engines.py
index 1fc6f1b7..de601b6d 100644
--- a/bd_api/apps/api/v1/search_engines.py
+++ b/bd_api/apps/api/v1/search_engines.py
@@ -15,23 +15,41 @@ class ASCIIFoldingElasticBackend(es_backend.Elasticsearch7SearchBackend, metacla
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         analyzer = {
-            "ascii_analyzer": {
-                "type": "custom",
-                "tokenizer": "standard",
+            "ngram": {
+                "tokenizer": "ngram",
                 "filter": ["asciifolding", "lowercase"],
             },
-            "ngram_analyzer": {
-                "type": "custom",
-                "tokenizer": "lowercase",
-                "filter": ["asciifolding", "haystack_ngram"],
+            "edgengram": {
+                "tokenizer": "edgengram",
+                "filter": ["asciifolding", "lowercase"],
+            },
+            "snowball_en": {
+                "type": "snowball",
+                "language": "English",
+                "filter": ["asciifolding"],
+            },
+            "snowball_pt": {
+                "type": "snowball",
+                "language": "Portuguese",
+                "filter": ["asciifolding"],
+            },
+        }
+        tokenizer = {
+            "ngram": {
+                "type": "ngram",
+                "min_gram": 3,
+                "max_gram": 5,
+                "token_chars": ["letter", "digit"],
             },
-            "edgengram_analyzer": {
-                "type": "custom",
-                "tokenizer": "lowercase",
-                "filter": ["asciifolding", "haystack_edgengram"],
+            "edgengram": {
+                "type": "edge_ngram",
+                "min_gram": 3,
+                "max_gram": 15,
+                "token_chars": ["letter", "digit"],
             },
         }
         self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer
+        self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer

     def build_schema(self, fields):
         content_field_name, mapping = super().build_schema(fields)
@@ -40,7 +58,13 @@ def build_schema(self, fields):
             if field_mapping["type"] == "text" and field_class.indexed:
                 if not hasattr(field_class, "facet_for"):
                     if field_class.field_type not in ("ngram", "edge_ngram"):
-                        field_mapping["analyzer"] = "ascii_analyzer"
+                        field_mapping["analyzer"] = "standard"
+                        field_mapping["fields"] = {
+                            "ngram": {"type": "text", "analyzer": "ngram"},
+                            "edgengram": {"type": "text", "analyzer": "edgengram"},
+                            "snowball_pt": {"type": "text", "analyzer": "snowball_pt"},
+                            "snowball_en": {"type": "text", "analyzer": "snowball_en"},
+                        }
             mapping.update({field_class.index_fieldname: field_mapping})

         return (content_field_name, mapping)
diff --git a/bd_api/apps/api/v1/search_indexes.py b/bd_api/apps/api/v1/search_indexes.py
index 9d5d7eea..e80b5b4b 100644
--- a/bd_api/apps/api/v1/search_indexes.py
+++ b/bd_api/apps/api/v1/search_indexes.py
@@ -183,7 +183,7 @@ def get_model(self):
         return Dataset

     def index_queryset(self, using=None):
-        return self.get_model().objects.exclude(status__slug="under_review").all()
+        return self.get_model().objects.exclude(status__slug="under_review")

     def prepare_organization_picture(self, obj):
         return getattr(obj.organization.picture, "name", None)
diff --git a/bd_api/apps/api/v1/search_views.py b/bd_api/apps/api/v1/search_views.py
index 62c99bd9..995d2153 100644
--- a/bd_api/apps/api/v1/search_views.py
+++ b/bd_api/apps/api/v1/search_views.py
@@ -21,9 +21,24 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def search(self):
-        sqs = super().search()
+        if not self.is_valid():
+            return self.no_query_found()
+
+        if not (q := self.cleaned_data.get("q")):
+            return self.no_query_found()
+
+        sqs = (
+            self.searchqueryset
+            .auto_query(q)
+            .filter_or(**{"text.ngram": q})
+            .filter_or(**{"text.edgengram": q})
+            .filter_or(**{"text.snowball_pt": q})
+            .filter_or(**{"text.snowball_en": q})
+        )  # fmt: skip
+
         for qp_value in self.contains:
             sqs = sqs.narrow(f'contains_{qp_value}:"true"')
+
         for qp_key, facet_key in [
             ("tag", "tag_slug"),
             ("theme", "theme_slug"),
@@ -32,6 +47,7 @@ def search(self):
         ]:
             for qp_value in getattr(self, qp_key, []):
                 sqs = sqs.narrow(f'{facet_key}:"{sqs.query.clean(qp_value)}"')
+
         return sqs

     def no_query_found(self):
@@ -113,7 +129,7 @@ def get_facets(self, sqs: SearchQuerySet):

     def get_results(self, sqs: SearchQuerySet):
         def key(r):
-            return (r.contains_tables, r.score)
+            return (r.n_tables, r.score)

         until = self.page * self.page_size
         since = (self.page - 1) * self.page_size
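
Note (illustrative sketch, not part of the applied diff): build_schema() now attaches ngram, edgengram, snowball_pt and snowball_en sub-fields to every indexed text field, and search() ORs the query across them with filter_or(). The snippet below shows a roughly analogous raw request issued with the elasticsearch-py 7 client; the endpoint and the index name "default" are assumptions standing in for the project's actual Elasticsearch URL and Haystack index name.

    from elasticsearch import Elasticsearch

    # Assumed local endpoint and index name; replace with the project's
    # real Elasticsearch URL and configured index name.
    es = Elasticsearch("http://localhost:9200")

    # Inspect how one of the new custom analyzers tokenizes a query string.
    tokens = es.indices.analyze(
        index="default",
        body={"analyzer": "ngram", "text": "educação básica"},
    )
    print([t["token"] for t in tokens["tokens"]])

    # Roughly analogous to the filter_or() chain in search(): match the query
    # against the base text field and its new analyzer sub-fields.
    resp = es.search(
        index="default",
        body={
            "query": {
                "multi_match": {
                    "query": "educação básica",
                    "fields": [
                        "text",
                        "text.ngram",
                        "text.edgengram",
                        "text.snowball_pt",
                        "text.snowball_en",
                    ],
                }
            }
        },
    )
    print(resp["hits"]["total"])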