Skip to content

Commit

Permalink
Merge pull request #700 from basedosdados/feat/spatial_coverage
Browse files Browse the repository at this point in the history
feat: spatial coverage in search, admin
rdahis authored Nov 3, 2024
2 parents 8dad202 + 7f4cb48 commit 3752745
Showing 7 changed files with 364 additions and 66 deletions.
10 changes: 8 additions & 2 deletions backend/apps/api/v1/admin.py
Original file line number Diff line number Diff line change
@@ -489,7 +489,8 @@ class DatasetAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
readonly_fields = [
"id",
"full_slug",
"coverage",
"spatial_coverage",
"temporal_coverage",
"contains_tables",
"contains_raw_data_sources",
"contains_information_requests",
@@ -508,7 +509,8 @@ class DatasetAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
list_display = [
"name",
"organization",
"coverage",
"spatial_coverage",
"temporal_coverage",
"related_objects",
"page_views",
"created_at",
@@ -548,6 +550,8 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin):
"partitions",
"created_at",
"updated_at",
"spatial_coverage",
"full_temporal_coverage",
"coverage_datetime_units",
]
search_fields = [
@@ -637,6 +641,8 @@ class ColumnAdmin(TabbedTranslationAdmin):
readonly_fields = [
"id",
"order",
"spatial_coverage",
"temporal_coverage",
]
search_fields = ["name", "table__name"]
inlines = [
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by Django 4.2.16 on 2024-11-03 01:29

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Add three optional columns to the ``area`` model: ``entity``, ``level`` and ``parent``."""

    # Must run after the migration that precedes it in the ``v1`` app.
    dependencies = [
        ('v1', '0036_datetimerange_units'),
    ]

    operations = [
        # Nullable link from an area to ``v1.entity``; PROTECT blocks deleting
        # an entity that is still referenced. Reverse accessor: ``entity.areas``.
        migrations.AddField(
            model_name='area',
            name='entity',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name='areas', to='v1.entity'),
        ),
        # Nullable integer column named ``level``.
        migrations.AddField(
            model_name='area',
            name='level',
            field=models.IntegerField(blank=True, null=True),
        ),
        # Nullable self-reference to ``v1.area``; PROTECT blocks deleting an
        # area that still has children. Reverse accessor: ``area.children``.
        migrations.AddField(
            model_name='area',
            name='parent',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name='children', to='v1.area'),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.16 on 2024-11-03 01:37

from django.db import migrations


class Migration(migrations.Migration):
    """Rename the ``area.level`` column to ``area.administrative_level``."""

    # Depends on the migration that introduced the ``level`` field.
    dependencies = [
        ('v1', '0037_area_entity_area_level_area_parent'),
    ]

    operations = [
        # RenameField changes only the field name; no new field is added here.
        migrations.RenameField(
            model_name='area',
            old_name='level',
            new_name='administrative_level',
        ),
    ]
156 changes: 134 additions & 22 deletions backend/apps/api/v1/models.py
Original file line number Diff line number Diff line change
@@ -21,6 +21,31 @@ class Area(BaseModel):
id = models.UUIDField(primary_key=True, default=uuid4)
slug = models.SlugField(unique=True)
name = models.CharField(max_length=255, blank=False, null=False)
administrative_level = models.IntegerField(
null=True,
blank=True,
choices=[
(0, '0'),
(1, '1'),
(2, '2'),
(3, '3'),
]
)
entity = models.ForeignKey(
"Entity",
on_delete=models.PROTECT,
related_name="areas",
null=True,
blank=True,
limit_choices_to={'category__slug': 'spatial'}
)
parent = models.ForeignKey(
"Area",
on_delete=models.PROTECT,
related_name="children",
null=True,
blank=True,
)

graphql_nested_filter_fields_whitelist = ["id"]

@@ -35,6 +60,27 @@ class Meta:
verbose_name_plural = "Areas"
ordering = ["name"]

def clean(self):
    """Check cross-field consistency of an area before it is saved.

    Rules enforced:
    - ``administrative_level``, when set, must be one of 0, 1, 2, 3.
    - ``entity``, when set, must belong to the category whose slug is
      ``'spatial'``.
    - When ``parent`` is set and is not the ``'world'`` area, both this
      area and the parent need an administrative level, and the parent's
      level must be exactly one less than this area's.

    Raises:
        ValidationError: with a dict mapping each offending field name to
            its message, when at least one rule is violated.
    """
    problems = {}
    level = self.administrative_level
    parent = self.parent

    if level is not None and level not in [0, 1, 2, 3]:
        problems['administrative_level'] = 'Administrative level must be 0, 1, 2, or 3'

    if self.entity and self.entity.category.slug != 'spatial':
        problems['entity'] = 'Entity must have category "spatial"'

    # The 'world' parent is exempt from the level-hierarchy rules.
    if parent and parent.slug != 'world':
        if level is None:
            problems['administrative_level'] = 'Administrative level is required when parent is set'
        elif parent.administrative_level is None:
            problems['parent'] = 'Parent must have an administrative level'
        elif parent.administrative_level != level - 1:
            problems['parent'] = 'Parent must have administrative level exactly one level above'

    if problems:
        raise ValidationError(problems)
    return super().clean()


class Coverage(BaseModel):
"""
@@ -517,22 +563,32 @@ def popularity(self):
return log10(self.page_views)

@property
def coverage(self) -> dict:
def temporal_coverage(self) -> dict:
"""Temporal coverage of all related entities"""
resources = [
*self.tables.all(),
*self.raw_data_sources.all(),
*self.information_requests.all(),
]
coverage = get_coverage(resources)
if coverage["start"] and coverage["end"]:
return f"{coverage['start']} - {coverage['end']}"
if coverage["start"]:
return f"{coverage['start']}"
if coverage["end"]:
return f"{coverage['end']}"
temporal_coverage = get_temporal_coverage(resources)
if temporal_coverage["start"] and temporal_coverage["end"]:
return f"{temporal_coverage['start']} - {temporal_coverage['end']}"
if temporal_coverage["start"]:
return f"{temporal_coverage['start']}"
if temporal_coverage["end"]:
return f"{temporal_coverage['end']}"
return ""

@property
def spatial_coverage(self) -> list[str]:
    """Sorted area slugs covered by this dataset's tables, raw data sources and information requests combined."""
    related = []
    for manager in (self.tables, self.raw_data_sources, self.information_requests):
        related.extend(manager.all())
    area_slugs = get_spatial_coverage(related)
    return sorted(area_slugs)

@property
def entities(self) -> list[dict]:
"""Entity of all related resources"""
@@ -931,14 +987,19 @@ def contains_closed_data(self):
return False

@property
def coverage(self) -> dict:
def temporal_coverage(self) -> dict:
"""Temporal coverage"""
return get_coverage([self])
return get_temporal_coverage([self])

@property
def full_coverage(self) -> dict:
def full_temporal_coverage(self) -> dict:
"""Temporal coverage steps"""
return get_full_coverage([self])
return get_full_temporal_coverage([self])

@property
def spatial_coverage(self) -> list[str]:
    """Sorted area slugs collected from this table's own coverages."""
    area_slugs = get_spatial_coverage([self])
    return sorted(area_slugs)

@property
def neighbors(self) -> list[dict]:
@@ -1201,6 +1262,7 @@ class Column(BaseModel, OrderedModel):
related_name="columns",
blank=True,
null=True,
limit_choices_to={'is_primary_key': True, 'table__is_directory': True}
)
measurement_unit = models.CharField(max_length=255, blank=True, null=True)
contains_sensitive_data = models.BooleanField(default=False, blank=True, null=True)
@@ -1241,17 +1303,25 @@ class Meta:
ordering = ["name"]

@property
def coverage(self) -> dict:
def temporal_coverage(self) -> dict:
"""Temporal coverage of column if exists, if not table coverage"""
coverage = get_coverage([self])
temporal_coverage = get_temporal_coverage([self])
fallback = defaultdict(lambda: None)
if not coverage["start"] or not coverage["end"]:
fallback = self.table.coverage
if not temporal_coverage["start"] or not temporal_coverage["end"]:
fallback = self.table.temporal_coverage
return {
"start": coverage["start"] or fallback["start"],
"end": coverage["end"] or fallback["end"],
"start": temporal_coverage["start"] or fallback["start"],
"end": temporal_coverage["end"] or fallback["end"],
}

@property
def spatial_coverage(self) -> list[str]:
    """Area slugs from this column's coverages; when there are none, the parent table's are used instead."""
    own_areas = get_spatial_coverage([self])
    if own_areas:
        return own_areas
    # No column-level areas: fall back to what the table declares.
    return get_spatial_coverage([self.table])

@property
def dir_column(self):
"""Column of directory table and column"""
@@ -1898,8 +1968,8 @@ def as_dict(self):
return {"date": self.str, "type": self.type}


def get_coverage(resources: list) -> dict:
"""Get maximum datetime coverage of resources
def get_temporal_coverage(resources: list) -> dict:
"""Get maximum temporal coverage of resources
Case:
- Table A has data with dates between [X, Y]
@@ -1918,8 +1988,8 @@ def get_coverage(resources: list) -> dict:
return {"start": since.str, "end": until.str}


def get_full_coverage(resources: list) -> dict:
"""Get datetime coverage steps of resources
def get_full_temporal_coverage(resources: list) -> dict:
"""Get temporal coverage steps of resources
Cases:
- Table A has data with dates between [X, Y], where [X, Y] is open
@@ -1957,3 +2027,45 @@ def get_full_coverage(resources: list) -> dict:
return [open_since.as_dict, open_until.as_dict]
if paid_since.str and paid_until.str:
return [paid_since.as_dict, paid_until.as_dict]

def _has_ancestor_in(slug: str, slugs: set) -> bool:
    """Return True when any proper underscore-delimited prefix of ``slug`` is in ``slugs``."""
    parts = slug.split('_')
    return any('_'.join(parts[:i]) in slugs for i in range(1, len(parts)))


def get_spatial_coverage(resources: list) -> list[str]:
    """Return the sorted, de-duplicated area slugs covered by ``resources``.

    Hierarchy is derived from the slug text alone: a slug is treated as a
    descendant of every slug formed by its leading underscore-separated
    parts (``br_mg_3100104`` is under ``br_mg`` and ``br``). A slug is dropped
    when one of those ancestors is also present, so only the broadest area of
    each branch is kept.

    Examples:
    - [br_mg_3100104, br_mg_3100104] -> [br_mg_3100104]
    - [br_mg_3100104, br_sp_3500105] -> [br_mg_3100104, br_sp_3500105]
    - [br_mg, us_ny, us] -> [br_mg, us]
    - [br_mg, world, us] -> [world]
    - no areas at all -> []

    Args:
        resources: objects exposing ``coverages.all()``; each coverage has an
            ``area`` attribute that is either falsy or has a ``slug``.

    Returns:
        Alphabetically sorted list of area slugs; ``['world']`` whenever the
        ``world`` slug is present; an empty list when no coverage has an area.
    """
    # Collect all unique area slugs across resources
    slugs = set()
    for resource in resources:
        for coverage in resource.coverages.all():
            if coverage.area:
                slugs.add(coverage.area.slug)

    if not slugs:
        return []

    # 'world' encompasses every other area
    if 'world' in slugs:
        return ['world']

    return sorted(slug for slug in slugs if not _has_ancestor_in(slug, slugs))
14 changes: 11 additions & 3 deletions backend/apps/api/v1/schemas.py
Original file line number Diff line number Diff line change
@@ -39,6 +39,13 @@ class RawDataSource(BaseModel):
id: str


class SpatialCoverage(BaseModel):
    """Schema for one spatial-coverage entry: an area slug plus three locale-suffixed names.

    NOTE(review): the search-result builder in this same change emits
    spatial coverage dicts with only ``slug`` and ``name`` keys; confirm
    whether this schema or that payload is the intended shape.
    """
    # Area identifier (slug).
    slug: str
    # Area name per locale suffix (presumably Portuguese, English, Spanish);
    # all three are declared as required strings.
    name_pt: str
    name_en: str
    name_es: str


class TemporalCoverage(BaseModel):
start_date: str
end_date: str
@@ -67,11 +74,12 @@ class Dataset(BaseModel):
contains_open_data: bool
contains_closed_data: bool
#
tags: List[Tag]
themes: List[Theme]
entities: List[Entity]
temporal_coverage: List[str]
organization: List[Organization]
temporal_coverage: List[str]
spatial_coverage: List[SpatialCoverage]
tags: List[Tag]
entities: List[Entity]


class Facet(BaseModel):
31 changes: 25 additions & 6 deletions backend/apps/api/v1/search_indexes.py
Original file line number Diff line number Diff line change
@@ -47,6 +47,21 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable):
null=True,
indexed=False,
)

spatial_coverage = indexes.MultiValueField(
model_attr="spatial_coverage",
null=True,
faceted=True,
indexed=True,
)

temporal_coverage = indexes.MultiValueField(
model_attr="temporal_coverage",
null=True,
faceted=True,
indexed=True,
)


table_id = indexes.MultiValueField(
model_attr="tables__pk",
@@ -213,12 +228,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable):
faceted=True,
indexed=False,
)
temporal_coverage = indexes.MultiValueField(
default="",
model_attr="coverage",
indexed=False,
)


contains_open_data = indexes.BooleanField(
model_attr="contains_open_data",
indexed=False,
@@ -294,3 +304,12 @@ def load_all_queryset(self, using=None):

def prepare_organization_picture(self, obj):
return getattr(obj.organization.picture, "name", None)

def get_field_mapping(self):
    """Return the parent class's field mapping with ``spatial_coverage`` overridden.

    The override declares ``spatial_coverage`` as a stored, indexed
    ``keyword`` field.

    NOTE(review): relies on the parent class providing
    ``get_field_mapping``; confirm that it does and that the search
    backend actually calls this hook.
    """
    field_mapping = super().get_field_mapping()
    keyword_field = {
        'type': 'keyword',
        'store': True,
        'index': True,
    }
    field_mapping['spatial_coverage'] = keyword_field
    return field_mapping
172 changes: 139 additions & 33 deletions backend/apps/api/v1/search_views.py
Original file line number Diff line number Diff line change
@@ -7,19 +7,17 @@
from haystack.models import SearchResult
from haystack.query import SearchQuerySet

from backend.apps.api.v1.models import Entity, Organization, Tag, Theme
from backend.apps.api.v1.models import Entity, Organization, Tag, Theme, Area


import logging
logger = logging.getLogger(__name__)
class DatasetSearchForm(FacetedSearchForm):
load_all: bool = True

def __init__(self, *args, **kwargs):
self.contains = kwargs.pop("contains", None) or []
self.tag = kwargs.pop("tag", None) or []
self.theme = kwargs.pop("theme", None) or []
self.organization = kwargs.pop("organization", None) or []
self.spatial_coverage = kwargs.pop("spatial_coverage", None)
self.tag = kwargs.pop("tag", None) or []
self.observation_level = kwargs.pop("observation_level", None) or []
self.locale = kwargs.pop("locale", "pt")
super().__init__(*args, **kwargs)
@@ -28,19 +26,30 @@ def search(self):
if not self.is_valid():
return self.no_query_found()

# Start with all results
sqs = self.searchqueryset.all()

# Debug print to see all form data
print("DEBUG: Form data:", {
'spatial_coverage': self.spatial_coverage,
'theme': self.theme,
'organization': self.organization,
'tag': self.tag,
})

# Text search if provided
if q := self.cleaned_data.get("q"):
sqs = (
self.searchqueryset
.auto_query(q)
sqs.auto_query(q)
.filter_and(**{"text.edgengram": q})
.filter_or(**{f"text.snowball_{self.locale}": q})
)
else:
sqs = self.no_query_found()

# Contains filters
for qp_value in self.contains:
sqs = sqs.narrow(f'contains_{qp_value}:"true"')

# Regular filters
for qp_key, facet_key in [
("tag", "tag_slug"),
("theme", "theme_slug"),
@@ -50,6 +59,37 @@ def search(self):
for qp_value in getattr(self, qp_key, []):
sqs = sqs.narrow(f'{facet_key}:"{sqs.query.clean(qp_value)}"')

if self.spatial_coverage:
# Build queries for all coverage values
coverage_queries = []
for coverage_list in self.spatial_coverage:
# Split the comma-separated values
coverages = coverage_list.split(',')
if 'world' in coverages:
# If world is in the list, only look for world coverage
coverage_queries = ['spatial_coverage_exact:"world"']
break
else:
# Regular case: handle hierarchical patterns for each coverage
for coverage in coverages:
parts = coverage.split('_')
coverage_patterns = [
'_'.join(parts[:i])
for i in range(1, len(parts))
]
coverage_patterns.append(coverage) # Add the full coverage too

# Build OR condition for all valid levels, including world
patterns = ' OR '.join(
f'spatial_coverage_exact:"{pattern}"'
for pattern in coverage_patterns + ['world']
)
coverage_queries.append(f'({patterns})')

# Combine all coverage queries with AND
query = f'_exists_:spatial_coverage_exact AND {" AND ".join(coverage_queries)}'
sqs = sqs.raw_search(query)

return sqs

def no_query_found(self):
@@ -91,9 +131,10 @@ def locale(self):
def get_form_kwargs(self):
kwargs = super().get_form_kwargs()
kwargs.update({"contains": self.request.GET.getlist("contains")})
kwargs.update({"tag": self.request.GET.getlist("tag")})
kwargs.update({"theme": self.request.GET.getlist("theme")})
kwargs.update({"organization": self.request.GET.getlist("organization")})
kwargs.update({"spatial_coverage": self.request.GET.getlist("spatial_coverage")})
kwargs.update({"tag": self.request.GET.getlist("tag")})
kwargs.update({"observation_level": self.request.GET.getlist("observation_level")})
kwargs.update({"locale": self.locale})
return kwargs
@@ -112,10 +153,11 @@ def get(self, request, *args, **kwargs):
)

def get_facets(self, sqs: SearchQuerySet, facet_size=22):
sqs = sqs.facet("tag_slug", size=facet_size)
sqs = sqs.facet("theme_slug", size=facet_size)
sqs = sqs.facet("entity_slug", size=facet_size)
sqs = sqs.facet("organization_slug", size=facet_size)
sqs = sqs.facet("spatial_coverage", size=facet_size)
sqs = sqs.facet("tag_slug", size=facet_size)
sqs = sqs.facet("entity_slug", size=facet_size)

facets = {}
facet_counts = sqs.facet_counts()
@@ -129,11 +171,12 @@ def get_facets(self, sqs: SearchQuerySet, facet_size=22):
"count": value[1],
}
)

for key_back, key_front, model in [
("tag_slug", "tags", Tag),
("theme_slug", "themes", Theme),
("entity_slug", "observation_levels", Entity),
("organization_slug", "organizations", Organization),
("tag_slug", "tags", Tag),
("entity_slug", "observation_levels", Entity),
]:
to_name = model.objects.values("slug", f"name_{self.locale}", "name")
to_name = {e["slug"]: {
@@ -145,6 +188,53 @@ def get_facets(self, sqs: SearchQuerySet, facet_size=22):
translated_name = to_name.get(field["key"], {})
field["name"] = translated_name.get("name", field["key"])
field["fallback"] = translated_name.get("fallback", True)

# Special handling for spatial coverage
if "spatial_coverage" in facets:
spatial_coverages = []
coverage_counts = {} # Dictionary to track counts per slug
coverage_data = {} # Dictionary to store the full data per slug

for field in facets.pop("spatial_coverage") or []:
coverage = field["key"]
areas = Area.objects.filter(slug=coverage, administrative_level=0)

if coverage == "world":
field["name"] = "World"
field["fallback"] = False

# Add all top-level areas (administrative_level = 0)
top_level_areas = Area.objects.filter(administrative_level=0)
for child_area in top_level_areas:
slug = child_area.slug
coverage_counts[slug] = coverage_counts.get(slug, 0) + field["count"]
coverage_data[slug] = {
"key": slug,
"name": getattr(child_area, f'name_{self.locale}') or child_area.name or slug,
"fallback": getattr(child_area, f'name_{self.locale}') is None
}
elif areas.exists():
for area in areas:
slug = area.slug
coverage_counts[slug] = coverage_counts.get(slug, 0) + field["count"]
coverage_data[slug] = {
"key": slug,
"name": getattr(area, f'name_{self.locale}') or area.name or coverage,
"fallback": getattr(area, f'name_{self.locale}') is None
}

# Create final list with collapsed counts and sort by count
spatial_coverages = []
for slug, count in coverage_counts.items():
entry = coverage_data[slug].copy()
entry["count"] = count
spatial_coverages.append(entry)

# Sort by count in descending order
spatial_coverages.sort(key=lambda x: x["count"], reverse=True)

facets["spatial_coverages"] = spatial_coverages

return facets

def get_results(self, sqs: SearchQuerySet):
@@ -160,15 +250,6 @@ def key(r):

def as_search_result(result: SearchResult, locale='pt'):

tags = []
for slug, name in zip(result.tag_slug or [], getattr(result, f"tag_name_{locale}") or []):
tags.append(
{
"slug": slug,
"name": name,
}
)

themes = []
for slug, name in zip(result.theme_slug or [], getattr(result, f"theme_name_{locale}") or []):
themes.append(
@@ -178,15 +259,6 @@ def as_search_result(result: SearchResult, locale='pt'):
}
)

entities = []
for slug, name in zip(result.entity_slug or [], getattr(result, f"entity_name_{locale}") or []):
entities.append(
{
"slug": slug,
"name": name,
}
)

organizations = []
for pk, slug, name, picture in zip(
result.organization_id or [],
@@ -204,6 +276,39 @@ def as_search_result(result: SearchResult, locale='pt'):
}
)

tags = []
for slug, name in zip(result.tag_slug or [], getattr(result, f"tag_name_{locale}") or []):
tags.append(
{
"slug": slug,
"name": name,
}
)

entities = []
for slug, name in zip(result.entity_slug or [], getattr(result, f"entity_name_{locale}") or []):
entities.append(
{
"slug": slug,
"name": name,
}
)

# Add spatial coverage translations
spatial_coverages = []
for coverage in (result.spatial_coverage or []):
area = Area.objects.filter(slug=coverage).first()
if area:
spatial_coverages.append({
'slug': coverage,
'name': getattr(area, f'name_{locale}') or area.name or coverage
})
else:
spatial_coverages.append({
'slug': coverage,
'name': coverage
})

return {
"updated_at": result.updated_at,
"id": result.dataset_id,
@@ -214,7 +319,8 @@ def as_search_result(result: SearchResult, locale='pt'):
"themes": themes,
"entities": entities,
"organizations": organizations,
"temporal_coverages": result.temporal_coverage,
"temporal_coverage": result.temporal_coverage,
"spatial_coverage": spatial_coverages,
"contains_open_data": result.contains_open_data,
"contains_closed_data": result.contains_closed_data,
"contains_tables": result.contains_tables,

0 comments on commit 3752745

Please sign in to comment.