From 72dac98d45f68f64a0a233189ebe01bec5288b12 Mon Sep 17 00:00:00 2001 From: Vinicius Date: Sat, 13 Apr 2024 11:16:26 -0300 Subject: [PATCH] fix: store data in index to increase speed --- bd_api/apps/api/v1/models.py | 59 --------- bd_api/apps/api/v1/search_indexes.py | 187 +++++++++++++++++++++++---- bd_api/apps/api/v1/search_views.py | 115 ++++++++++++++-- 3 files changed, 270 insertions(+), 91 deletions(-) diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py index 97d945da..63f27784 100644 --- a/bd_api/apps/api/v1/models.py +++ b/bd_api/apps/api/v1/models.py @@ -353,13 +353,6 @@ class Meta: verbose_name_plural = "Tags" ordering = ["slug"] - @property - def as_search_result(self): - return { - "name": self.name, - "slug": self.slug, - } - class Theme(BaseModel): """Theme model""" @@ -383,13 +376,6 @@ class Meta: verbose_name_plural = "Themes" ordering = ["slug"] - @property - def as_search_result(self): - return { - "name": self.name, - "slug": self.slug, - } - class Organization(BaseModel): """Organization model""" @@ -440,17 +426,6 @@ def has_picture(self): return True return False - @property - def as_search_result(self): - return { - "id": self.pk, - "name": self.name, - "slug": self.slug, - "description": self.description, - "picture": getattr(self.picture, "name", None), - "website": self.website, - } - class Status(BaseModel): """Status model""" @@ -681,33 +656,6 @@ def raw_data_source_last_updated_at(self): ] # fmt: skip return max(updates) if updates else None - @property - def as_search_result(self): - return { - "updated_at": self.updated_at, - "id": self.id, - "slug": self.slug, - "name": self.name, - "temporal_coverage": [self.coverage], - "organization": [self.organization.as_search_result], - "tags": [t.as_search_result for t in self.tags.all()], - "themes": [t.as_search_result for t in self.themes.all()], - "entities": self.entities, - "contains_open_data": self.contains_open_data, - "contains_closed_data": self.contains_closed_data, - "contains_tables": self.contains_tables, - "contains_raw_data_sources": self.contains_raw_data_sources, - "contains_information_requests": self.contains_information_requests, - "n_tables": self.n_tables, - "n_raw_data_sources": self.n_raw_data_sources, - "n_information_requests": self.n_information_requests, - "first_table_id": self.first_table_id, - "first_open_table_id": self.first_open_table_id, - "first_closed_table_id": self.first_closed_table_id, - "first_raw_data_source_id": self.first_raw_data_source_id, - "first_information_request_id": self.first_information_request_id, - } - class Update(BaseModel): id = models.UUIDField(primary_key=True, default=uuid4) @@ -1488,13 +1436,6 @@ class Meta: verbose_name_plural = "Entities" ordering = ["slug"] - @property - def as_search_result(self): - return { - "name": self.name, - "slug": self.slug, - } - class ObservationLevel(BaseModel): """Model definition for ObservationLevel.""" diff --git a/bd_api/apps/api/v1/search_indexes.py b/bd_api/apps/api/v1/search_indexes.py index d59e1c38..9d5d7eea 100644 --- a/bd_api/apps/api/v1/search_indexes.py +++ b/bd_api/apps/api/v1/search_indexes.py @@ -7,40 +7,183 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): text = indexes.CharField(document=True, use_template=True) - dataset = indexes.CharField(model_attr="slug", null=True, faceted=True) - dataset_name = indexes.CharField(model_attr="name", null=True) - dataset_description = indexes.CharField(model_attr="description", null=True) + updated_at = indexes.DateTimeField(model_attr="updated_at") - table = indexes.MultiValueField(model_attr="tables__slug", null=True, faceted=True) - table_names = indexes.MultiValueField(model_attr="tables__name", null=True) - table_descriptions = indexes.MultiValueField(model_attr="tables__description", null=True) + dataset_id = indexes.CharField( + model_attr="pk", + indexed=False, + ) + dataset_slug = indexes.CharField( + model_attr="slug", + indexed=False, + ) + dataset_name = indexes.CharField( + model_attr="name", + indexed=False, + ) + dataset_description = indexes.CharField( + model_attr="description", + default="", + indexed=False, + ) - organization = indexes.CharField(model_attr="organization__slug", null=True, faceted=True) - organization_names = indexes.CharField(model_attr="organization__name", null=True) - organization_descriptions = indexes.CharField(model_attr="organization__description", null=True) + table_id = indexes.MultiValueField( + model_attr="tables__pk", + indexed=False, + ) + table_slug = indexes.MultiValueField( + model_attr="tables__slug", + indexed=False, + ) + table_name = indexes.MultiValueField( + model_attr="tables__name", + indexed=False, + ) + table_description = indexes.MultiValueField( + model_attr="tables__description", + default="", + indexed=False, + ) - tag = indexes.MultiValueField(model_attr="tags__slug", null=True, faceted=True) - tag_names = indexes.MultiValueField(model_attr="tags__name", null=True) + organization_id = indexes.MultiValueField( + model_attr="organization__pk", + faceted=True, + indexed=False, + ) + organization_slug = indexes.MultiValueField( + model_attr="organization__slug", + faceted=True, + indexed=False, + ) + organization_name = indexes.MultiValueField( + model_attr="organization__name", + indexed=False, + ) + organization_picture = indexes.MultiValueField( + model_attr="organization__picture", + default="", + indexed=False, + ) + organization_website = indexes.MultiValueField( + model_attr="organization__website", + default="", + indexed=False, + ) + organization_description = indexes.MultiValueField( + model_attr="organization__description", + default="", + indexed=False, + ) - theme = indexes.MultiValueField(model_attr="themes__slug", null=True, faceted=True) - theme_names = indexes.MultiValueField(model_attr="themes__name", null=True) + tag_slug = indexes.MultiValueField( + model_attr="tags__slug", + default="", + faceted=True, + indexed=False, + ) + tag_name = indexes.MultiValueField( + model_attr="tags__name", + default="", + indexed=False, + ) - entity = indexes.MultiValueField( - model_attr="tables__observation_levels__entity__slug", null=True, faceted=True + theme_slug = indexes.MultiValueField( + model_attr="themes__slug", + default="", + faceted=True, + indexed=False, ) - entity_names = indexes.MultiValueField( - model_attr="tables__observation_levels__entity__name", null=True, faceted=True + theme_name = indexes.MultiValueField( + model_attr="themes__name", + default="", + indexed=False, ) - contains_open_data = indexes.BooleanField(model_attr="contains_open_data") - contains_closed_data = indexes.BooleanField(model_attr="contains_closed_data") + entity_slug = indexes.MultiValueField( + model_attr="tables__observation_levels__entity__slug", + default="", + faceted=True, + indexed=False, + ) + entity_name = indexes.MultiValueField( + model_attr="tables__observation_levels__entity__name", + default="", + faceted=True, + indexed=False, + ) - contains_tables = indexes.BooleanField(model_attr="contains_tables") - contains_raw_data_sources = indexes.BooleanField(model_attr="contains_raw_data_sources") - contains_information_requests = indexes.BooleanField(model_attr="contains_information_requests") + temporal_coverage = indexes.MultiValueField( + default="", + model_attr="coverage", + indexed=False, + ) + + contains_open_data = indexes.BooleanField( + model_attr="contains_open_data", + indexed=False, + ) + contains_closed_data = indexes.BooleanField( + model_attr="contains_closed_data", + indexed=False, + ) + + contains_tables = indexes.BooleanField( + model_attr="contains_tables", + indexed=False, + ) + contains_raw_data_sources = indexes.BooleanField( + model_attr="contains_raw_data_sources", + indexed=False, + ) + contains_information_requests = indexes.BooleanField( + model_attr="contains_information_requests", + indexed=False, + ) + + n_tables = indexes.IntegerField( + model_attr="n_tables", + indexed=False, + ) + n_raw_data_sources = indexes.IntegerField( + model_attr="n_raw_data_sources", + indexed=False, + ) + n_information_requests = indexes.IntegerField( + model_attr="n_information_requests", + indexed=False, + ) + + first_table_id = indexes.CharField( + model_attr="first_table_id", + default="", + indexed=False, + ) + first_open_table_id = indexes.CharField( + model_attr="first_open_table_id", + default="", + indexed=False, + ) + first_closed_table_id = indexes.CharField( + model_attr="first_closed_table_id", + default="", + indexed=False, + ) + first_raw_data_source_id = indexes.CharField( + model_attr="first_raw_data_source_id", + default="", + indexed=False, + ) + first_information_request_id = indexes.CharField( + model_attr="first_information_request_id", + default="", + indexed=False, + ) def get_model(self): return Dataset def index_queryset(self, using=None): return self.get_model().objects.exclude(status__slug="under_review").all() + + def prepare_organization_picture(self, obj): + return getattr(obj.organization.picture, "name", None) diff --git a/bd_api/apps/api/v1/search_views.py b/bd_api/apps/api/v1/search_views.py index ce37f900..05cc6830 100644 --- a/bd_api/apps/api/v1/search_views.py +++ b/bd_api/apps/api/v1/search_views.py @@ -2,6 +2,7 @@ from django.http import JsonResponse from haystack.forms import FacetedSearchForm from haystack.generic_views import FacetedSearchView +from haystack.models import SearchResult from haystack.query import SearchQuerySet from bd_api.apps.api.v1.models import Entity, Organization, Tag, Theme @@ -17,10 +18,10 @@ def no_query_found(self): class DatasetSearchView(FacetedSearchView): form_class = DatasetSearchForm facet_fields = [ - "tag", - "theme", - "entity", - "organization", + "tag_slug", + "theme_slug", + "entity_slug", + "organization_slug", "contains_open_data", "contains_closed_data", "contains_tables", @@ -28,12 +29,28 @@ class DatasetSearchView(FacetedSearchView): "contains_information_requests", ] + @property + def page(self): + try: + return int(self.request.GET.get("page", 1)) + except (TypeError, ValueError): + return 1 + + @property + def page_size(self): + try: + return int(self.request.GET.get("page_size", 10)) + except (TypeError, ValueError): + return 10 + def get(self, request, *args, **kwargs): if form := self.get_form(): if sqs := form.search(): return JsonResponse( { "count": sqs.count(), + "page": self.page, + "page_size": self.page_size, "results": self.get_results(sqs), "aggregations": self.get_facets(sqs), } @@ -51,20 +68,98 @@ def get_facets(self, sqs: SearchQuerySet): } ) for key, model in [ - ("tag", Tag), - ("theme", Theme), - ("entity", Entity), - ("organization", Organization), + ("tag_slug", Tag), + ("theme_slug", Theme), + ("entity_slug", Entity), + ("organization_slug", Organization), ]: m = model.objects.values("slug", "name") m = {mi["slug"]: mi["name"] for mi in m.all()} for field in facets[key]: - field["name"] = m[field["key"]] + field["name"] = m.get(field["key"], "") return facets def get_results(self, sqs: SearchQuerySet): def key(r): return (r.contains_tables, r.score) + until = self.page * self.page_size + since = (self.page - 1) * self.page_size + results = sorted(sqs.all(), key=key, reverse=True) - return [r.object.as_search_result for r in results] + return [as_search_result(r) for r in results[since:until]] + + +def as_search_result(result: SearchResult): + tag = [] + for slug, name in zip(result.tag_slug, result.tag_name): + tag.append( + { + "slug": slug, + "name": name, + } + ) + + theme = [] + for slug, name in zip(result.theme_slug, result.theme_name): + theme.append( + { + "slug": slug, + "name": name, + } + ) + + entity = [] + for slug, name in zip(result.entity_slug, result.entity_name): + entity.append( + { + "slug": slug, + "name": name, + } + ) + + organization = [] + for pk, slug, name, picture, website, description in zip( + result.organization_id, + result.organization_slug, + result.organization_name, + result.organization_picture, + result.organization_website, + result.organization_description, + ): + organization.append( + { + "id": pk, + "slug": slug, + "name": name, + "picture": picture, + "website": website, + "description": description, + } + ) + + return { + "updated_at": result.updated_at, + "id": result.dataset_id, + "slug": result.dataset_slug, + "name": result.dataset_name, + "description": result.dataset_description, + "tags": tag, + "themes": theme, + "entities": entity, + "organizations": organization, + "temporal_coverages": result.temporal_coverage, + "contains_open_data": result.contains_open_data, + "contains_closed_data": result.contains_closed_data, + "contains_tables": result.contains_tables, + "contains_raw_data_sources": result.contains_raw_data_sources, + "contains_information_requests": result.contains_information_requests, + "n_tables": result.n_tables, + "n_raw_data_sources": result.n_raw_data_sources, + "n_information_requests": result.n_information_requests, + "first_table_id": result.first_table_id, + "first_open_table_id": result.first_open_table_id, + "first_closed_table_id": result.first_closed_table_id, + "first_raw_data_source_id": result.first_raw_data_source_id, + "first_information_request_id": result.first_information_request_id, + }