feat: update search engine, index and view
vncsna committed Apr 7, 2024
1 parent 56882b5 commit 1e4c2bb
Showing 7 changed files with 240 additions and 836 deletions.
163 changes: 137 additions & 26 deletions bd_api/apps/api/v1/models.py
@@ -7,7 +7,6 @@

 from django.core.exceptions import ValidationError
 from django.db import models
-from django.urls import reverse
 from ordered_model.models import OrderedModel
 
 from bd_api.apps.account.models import Account
@@ -354,6 +353,13 @@ class Meta:
         verbose_name_plural = "Tags"
         ordering = ["slug"]
 
+    @property
+    def as_search_result(self):
+        return {
+            "name": self.name,
+            "slug": self.slug,
+        }
+
 
 class Theme(BaseModel):
     """Theme model"""
@@ -377,6 +383,13 @@ class Meta:
         verbose_name_plural = "Themes"
         ordering = ["slug"]
 
+    @property
+    def as_search_result(self):
+        return {
+            "name": self.name,
+            "slug": self.slug,
+        }
+
 
 class Organization(BaseModel):
     """Organization model"""
@@ -427,6 +440,17 @@ def has_picture(self):
             return True
         return False
 
+    @property
+    def as_search_result(self):
+        return {
+            "id": self.pk,
+            "name": self.name,
+            "slug": self.slug,
+            "description": self.description,
+            "picture": getattr(self.picture, "name", None),
+            "website": self.website,
+        }
+
 
 class Status(BaseModel):
     """Status model"""
@@ -503,10 +527,6 @@ class Meta:
         verbose_name_plural = "Datasets"
         ordering = ["slug"]
 
-    def get_success_url(self):
-        """Get the success url for the dataset"""
-        return reverse("datasetdetail", kwargs={"pk": self.object.pk})
-
     @property
     def full_slug(self):
         if self.organization.area.slug != "unknown":
@@ -524,12 +544,12 @@ def popularity(self):
     @property
     def coverage(self) -> dict:
         """Temporal coverage of all related entities"""
-        entities = [
+        resources = [
             *self.tables.all(),
             *self.raw_data_sources.all(),
             *self.information_requests.all(),
         ]
-        coverage = get_coverage(entities)
+        coverage = get_coverage(resources)
         if coverage["start"] and coverage["end"]:
             return f"{coverage['start']} - {coverage['end']}"
         if coverage["start"]:
@@ -538,6 +558,20 @@ def coverage(self) -> dict:
             return f"{coverage['end']}"
         return ""
 
+    @property
+    def entities(self) -> list[dict]:
+        """Entity of all related resources"""
+        entities = []
+        resources = [
+            *self.tables.all(),
+            *self.raw_data_sources.all(),
+            *self.information_requests.all(),
+        ]
+        for resource in resources:
+            for observation in resource.observation_levels.all():
+                entities.append(observation.entity.as_search_result)
+        return entities
+
     @property
     def contains_open_data(self):
         """Returns true if there are tables or columns with open coverages"""
@@ -582,6 +616,45 @@ def contains_information_requests(self):
         """Returns true if there are information requests in the dataset"""
         return len(self.information_requests.all()) > 0
 
+    @property
+    def n_tables(self):
+        return len(self.tables.all())
+
+    @property
+    def n_raw_data_sources(self):
+        return len(self.raw_data_sources.all())
+
+    @property
+    def n_information_requests(self):
+        return len(self.information_requests.all())
+
+    @property
+    def first_table_id(self):
+        if resource := self.tables.order_by("order").first():
+            return resource.pk
+
+    @property
+    def first_open_table_id(self):
+        for resource in self.tables.order_by("order").all():
+            if resource.contains_open_data:
+                return resource.pk
+
+    @property
+    def first_closed_table_id(self):
+        for resource in self.tables.order_by("order").all():
+            if resource.contains_closed_data:
+                return resource.pk
+
+    @property
+    def first_raw_data_source_id(self):
+        if resource := self.raw_data_sources.order_by("order").first():
+            return resource.pk
+
+    @property
+    def first_information_request_id(self):
+        if resource := self.information_requests.order_by("order").first():
+            return resource.pk
+
     @property
     def table_last_updated_at(self):
         updates = [
@@ -598,6 +671,33 @@ def raw_data_source_last_updated_at(self):
         ]  # fmt: skip
         return max(updates) if updates else None
 
+    @property
+    def as_search_result(self):
+        return {
+            "updated_at": self.updated_at,
+            "id": self.id,
+            "slug": self.slug,
+            "name": self.name,
+            "temporal_coverage": [self.coverage],
+            "organization": [self.organization.as_search_result],
+            "tags": [t.as_search_result for t in self.tags.all()],
+            "themes": [t.as_search_result for t in self.themes.all()],
+            "entities": self.entities,
+            "contains_open_data": self.contains_open_data,
+            "contains_closed_data": self.contains_closed_data,
+            "contains_tables": self.contains_tables,
+            "contains_raw_data_sources": self.contains_raw_data_sources,
+            "contains_information_requests": self.contains_information_requests,
+            "n_tables": self.n_tables,
+            "n_raw_data_sources": self.n_raw_data_sources,
+            "n_information_requests": self.n_information_requests,
+            "first_table_id": self.first_table_id,
+            "first_open_table_id": self.first_open_table_id,
+            "first_closed_table_id": self.first_closed_table_id,
+            "first_raw_data_source_id": self.first_raw_data_source_id,
+            "first_information_request_id": self.first_information_request_id,
+        }
+
 
 class Update(BaseModel):
     id = models.UUIDField(primary_key=True, default=uuid4)
@@ -769,18 +869,22 @@ def partitions(self):
         return ", ".join(partitions_list)
 
     @property
-    def contains_closed_data(self):
-        """Returns true if there are columns with closed coverages"""
-        closed_data = False
-        table_coverages = self.coverages.filter(is_closed=True)
-        if table_coverages:
-            closed_data = True
-        for column in self.columns.all():  # in the future it will be column.coverages
-            if column.is_closed:
-                closed_data = True
-                break
-
-        return closed_data
+    def contains_open_data(self):
+        if self.coverages.filter(is_closed=False):
+            return True
+        for column in self.columns.all():
+            if column.coverages.filter(is_closed=False).first():
+                return True
+        return False
+
+    @property
+    def contains_closed_data(self):
+        if self.coverages.filter(is_closed=True).first():
+            return True
+        for column in self.columns.all():
+            if column.coverages.filter(is_closed=True).first():
+                return True
+        return False
 
     @property
     def coverage(self) -> dict:
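An editorial aside on the rewritten checks above: truthiness on a QuerySet and .first() both work, but each pulls rows back from the database. A sketch of an equivalent check using QuerySet.exists(), which compiles to an EXISTS query — this is an alternative, not what the commit ships:

    @property
    def contains_closed_data(self):
        # Same logic as the committed version, but .exists() asks the
        # database a yes/no question instead of materializing a row.
        if self.coverages.filter(is_closed=True).exists():
            return True
        return any(
            column.coverages.filter(is_closed=True).exists()
            for column in self.columns.all()
        )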
@@ -1374,6 +1478,13 @@ class Meta:
         verbose_name_plural = "Entities"
         ordering = ["slug"]
 
+    @property
+    def as_search_result(self):
+        return {
+            "name": self.name,
+            "slug": self.slug,
+        }
+
 
 class ObservationLevel(BaseModel):
     """Model definition for ObservationLevel."""
@@ -1670,16 +1781,16 @@ def as_dict(self):
         return {"date": self.str, "type": self.type}
 
 
-def get_coverage(entities: list) -> dict:
-    """Get maximum datetime coverage of entities
+def get_coverage(resources: list) -> dict:
+    """Get maximum datetime coverage of resources
 
     Case:
     - Table A has data with dates between [X, Y]
     """
     since = Date(datetime.max, None, None)
     until = Date(datetime.min, None, None)
-    for entity in entities:
-        for cov in entity.coverages.all():
+    for resource in resources:
+        for cov in resource.coverages.all():
             for dt in cov.datetime_ranges.all():
                 if dt.since and dt.since < since.dt:
                     since.dt = dt.since
@@ -1690,8 +1801,8 @@ def get_full_coverage(entities: list) -> dict:
     return {"start": since.str, "end": until.str}
 
 
-def get_full_coverage(entities: list) -> dict:
-    """Get datetime coverage steps of entities
+def get_full_coverage(resources: list) -> dict:
+    """Get datetime coverage steps of resources
 
     Cases:
     - Table A has data with dates between [X, Y], where [X, Y] is open
@@ -1702,8 +1813,8 @@
     open_until = Date(datetime.min, None, "open")
     paid_since = Date(datetime.max, None, "closed")
    paid_until = Date(datetime.min, None, "closed")
-    for entity in entities:
-        for cov in entity.coverages.all():
+    for resource in resources:
+        for cov in resource.coverages.all():
             for dt in cov.datetime_ranges.all():
                 if not cov.is_closed:
                     if dt.since and dt.since < open_since.dt:
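The commit title says the search index and view were updated alongside these models, but those files are not rendered on this page. As a rough sketch of how a django-haystack index could consume the new as_search_result payload — the DatasetIndex class, its fields, and the template-backed document field are assumptions, not code from this commit:

    from haystack import indexes

    from bd_api.apps.api.v1.models import Dataset


    class DatasetIndex(indexes.SearchIndex, indexes.Indexable):
        # Haystack requires exactly one document field per index.
        text = indexes.CharField(document=True, use_template=True)

        def get_model(self):
            return Dataset

        def index_queryset(self, using=None):
            return self.get_model().objects.all()

        def prepare(self, obj):
            # Merge the flat dict the model already knows how to build
            # into haystack's prepared data for this document.
            data = super().prepare(obj)
            data.update(obj.as_search_result)
            return data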
77 changes: 9 additions & 68 deletions bd_api/apps/api/v1/search_engines.py
@@ -13,93 +13,34 @@
 
 class ASCIIFoldingElasticBackend(es_backend.Elasticsearch7SearchBackend, metaclass=ABCMeta):
     def __init__(self, *args, **kwargs):
-        super(ASCIIFoldingElasticBackend, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         analyzer = {
-            "ascii_ngram_analyser": {
-                "type": "custom",
-                "tokenizer": "standard",
-                "filter": ["asciifolding", "lowercase", "haystack_edgengram"],
-            },
-            "standard_analyzer": {
+            "ascii_analyzer": {
                 "type": "custom",
                 "tokenizer": "standard",
                 "filter": ["asciifolding", "lowercase"],
             },
             "ngram_analyzer": {
                 "type": "custom",
-                "tokenizer": "standard",
-                "filter": ["asciifolding", "lowercase", "haystack_ngram"],
+                "tokenizer": "lowercase",
+                "filter": ["asciifolding", "haystack_ngram"],
             },
             "edgengram_analyzer": {
                 "type": "custom",
-                "tokenizer": "my_tokenizer",
-                "filter": ["asciifolding", "lowercase"],
-            },
-        }
-        tokenizer = {
-            "standard": {"type": "standard"},
-            "lowercase": {"type": "lowercase"},
-            "my_tokenizer": {
-                "type": "edge_ngram",
-                "min_gram": 3,
-                "max_gram": 15,
-                "token_chars": ["letter", "digit"],
+                "tokenizer": "lowercase",
+                "filter": ["asciifolding", "haystack_edgengram"],
             },
         }
-        filter = {
-            "haystack_ngram": {
-                "type": "ngram",
-                "min_gram": 4,
-                "max_gram": 5,
-            },
-            "haystack_edgengram": {
-                "type": "edge_ngram",
-                "min_gram": 2,
-                "max_gram": 15,
-            },
-        }
 
-        self.DEFAULT_SETTINGS["settings"]["analysis"]["tokenizer"] = tokenizer
         self.DEFAULT_SETTINGS["settings"]["analysis"]["analyzer"] = analyzer
-        self.DEFAULT_SETTINGS["settings"]["analysis"]["filter"] = filter
 
     def build_schema(self, fields):
-        content_field_name, mapping = super(ASCIIFoldingElasticBackend, self).build_schema(fields)
-
-        for field_name, field_class in fields.items():
+        content_field_name, mapping = super().build_schema(fields)
+        for field_class in fields.values():
             field_mapping = mapping[field_class.index_fieldname]
-
             if field_mapping["type"] == "text" and field_class.indexed:
                 if not hasattr(field_class, "facet_for"):
-                    if field_class.field_type not in ("ngram", "edge_ngram"):
-                        field_mapping["analyzer"] = "ascii_ngram_analyser"
-                        field_mapping["fields"] = {
-                            "exact": {
-                                "type": "text",
-                                "analyzer": "standard_analyzer",
-                            },
-                            "keyword": {
-                                "type": "keyword",
-                                "ignore_above": 256,
-                            },
-                        }
-                    else:
-                        field_mapping["analyzer"] = "standard_analyzer"
-                        field_mapping["fields"] = {
-                            "ngram": {
-                                "type": "text",
-                                "analyzer": "ngram_analyzer",
-                            },
-                            "edgengram": {
-                                "type": "text",
-                                "analyzer": "edgengram_analyzer",
-                            },
-                            "exact": {
-                                "type": "text",
-                                "analyzer": "standard_analyzer",
-                            },
-                        }
-
+                    field_mapping["analyzer"] = "ascii_analyzer"
             mapping.update({field_class.index_fieldname: field_mapping})
         return (content_field_name, mapping)

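For the simplified backend to be used at all, haystack needs an engine class that points to it, selected via HAYSTACK_CONNECTIONS; neither appears in this diff, so the engine name and connection values below are assumptions:

    from haystack.backends.elasticsearch7_backend import Elasticsearch7SearchEngine


    class ASCIIFoldingElasticSearchEngine(Elasticsearch7SearchEngine):
        # Swap in the backend defined above; everything else is stock haystack.
        backend = ASCIIFoldingElasticBackend


    # settings.py (hypothetical values)
    HAYSTACK_CONNECTIONS = {
        "default": {
            "ENGINE": "bd_api.apps.api.v1.search_engines.ASCIIFoldingElasticSearchEngine",
            "URL": "http://localhost:9200/",
            "INDEX_NAME": "bd_api",
        },
    }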