Skip to content

Commit

Permalink
feat: improve search with ngram and snowball
Browse files Browse the repository at this point in the history
  • Loading branch information
vncsna committed Apr 17, 2024
1 parent f99adb1 commit 586fd5d
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 15 deletions.
48 changes: 36 additions & 12 deletions bd_api/apps/api/v1/search_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,41 @@ class ASCIIFoldingElasticBackend(es_backend.Elasticsearch7SearchBackend, metacla
def __init__(self, *args, **kwargs):
    """Install accent-insensitive ngram and snowball analysis into the backend.

    Injects custom analyzers and tokenizers into haystack's Elasticsearch
    ``DEFAULT_SETTINGS`` so every index built by this backend supports:

    - ``ngram`` / ``edgengram``: partial and prefix matching, folded to
      ASCII and lowercased (accent-insensitive);
    - ``snowball_en`` / ``snowball_pt``: English and Portuguese stemming
      with ASCII folding.

    NOTE(review): this span was reconstructed from a garbled diff view
    (old and new lines were interleaved); verify against the committed
    file before relying on exact values.
    """
    super().__init__(*args, **kwargs)
    # Analyzers keyed by the names referenced in build_schema's sub-fields.
    analyzer = {
        "ngram": {
            "tokenizer": "ngram",
            "filter": ["asciifolding", "lowercase"],
        },
        "edgengram": {
            "tokenizer": "edgengram",
            "filter": ["asciifolding", "lowercase"],
        },
        "snowball_en": {
            "type": "snowball",
            "language": "English",
            "filter": ["asciifolding"],
        },
        "snowball_pt": {
            "type": "snowball",
            "language": "Portuguese",
            "filter": ["asciifolding"],
        },
    }
    # Custom tokenizers backing the ngram analyzers above.
    tokenizer = {
        "ngram": {
            "type": "ngram",
            "min_gram": 3,
            "max_gram": 5,
            "token_chars": ["letter", "digit"],
        },
        "edgengram": {
            "type": "edge_ngram",
            "min_gram": 3,
            "max_gram": 15,  # longer max_gram so prefixes of long words still match
            "token_chars": ["letter", "digit"],
        },
    }
    self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer
    self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer

def build_schema(self, fields):
content_field_name, mapping = super().build_schema(fields)
Expand All @@ -40,7 +58,13 @@ def build_schema(self, fields):
if field_mapping["type"] == "text" and field_class.indexed:
if not hasattr(field_class, "facet_for"):
if field_class.field_type not in ("ngram", "edge_ngram"):
field_mapping["analyzer"] = "ascii_analyzer"
field_mapping["analyzer"] = "standard"
field_mapping["fields"] = {
"ngram": {"type": "text", "analyzer": "ngram"},
"edgengram": {"type": "text", "analyzer": "edgengram"},
"snowball_pt": {"type": "text", "analyzer": "snowball_pt"},
"snowball_en": {"type": "text", "analyzer": "snowball_en"},
}
mapping.update({field_class.index_fieldname: field_mapping})
return (content_field_name, mapping)

Expand Down
2 changes: 1 addition & 1 deletion bd_api/apps/api/v1/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def get_model(self):
return Dataset

def index_queryset(self, using=None):
    """Return the queryset of datasets to index.

    Excludes datasets whose status is still ``under_review`` so they never
    appear in search results. ``using`` (the haystack connection alias) is
    accepted for interface compatibility but unused here. The trailing
    ``.all()`` from the old version was redundant — ``exclude`` already
    returns a QuerySet.
    """
    return self.get_model().objects.exclude(status__slug="under_review")

def prepare_organization_picture(self, obj):
    """Return the stored file name of the dataset organization's picture.

    Falls back to ``None`` when the picture field has no ``name`` attribute
    (e.g. no file uploaded).
    """
    picture = obj.organization.picture
    return getattr(picture, "name", None)
20 changes: 18 additions & 2 deletions bd_api/apps/api/v1/search_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,24 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def search(self):
sqs = super().search()
if not self.is_valid():
return self.no_query_found()

if not (q := self.cleaned_data.get("q")):
return self.no_query_found()

sqs = (
self.searchqueryset
.auto_query(q)
.filter_or(**{"text.ngram": q})
.filter_or(**{"text.edgengram": q})
.filter_or(**{"text.snowball_pt": q})
.filter_or(**{"text.snowball_en": q})
) # fmt: skip

for qp_value in self.contains:
sqs = sqs.narrow(f'contains_{qp_value}:"true"')

for qp_key, facet_key in [
("tag", "tag_slug"),
("theme", "theme_slug"),
Expand All @@ -32,6 +47,7 @@ def search(self):
]:
for qp_value in getattr(self, qp_key, []):
sqs = sqs.narrow(f'{facet_key}:"{sqs.query.clean(qp_value)}"')

return sqs

def no_query_found(self):
Expand Down Expand Up @@ -113,7 +129,7 @@ def get_facets(self, sqs: SearchQuerySet):

def get_results(self, sqs: SearchQuerySet):
def key(r):
    # Sort key for search results: table count first, then relevance score
    # (the commit renamed the attribute from contains_tables to n_tables).
    return (r.n_tables, r.score)

until = self.page * self.page_size
since = (self.page - 1) * self.page_size
Expand Down

0 comments on commit 586fd5d

Please sign in to comment.