From ba85bb259d375ac28141b703e07ce2ca6d9292c1 Mon Sep 17 00:00:00 2001 From: Ricardo Dahis Date: Sun, 3 Nov 2024 15:05:34 +1100 Subject: [PATCH 1/2] feat: spatial coverage in search, admin --- backend/apps/api/v1/admin.py | 10 +- ...0037_area_entity_area_level_area_parent.py | 29 +++ ..._rename_level_area_administrative_level.py | 18 ++ backend/apps/api/v1/models.py | 155 +++++++++++++--- backend/apps/api/v1/schemas.py | 14 +- backend/apps/api/v1/search_indexes.py | 31 +++- backend/apps/api/v1/search_views.py | 172 ++++++++++++++---- 7 files changed, 363 insertions(+), 66 deletions(-) create mode 100644 backend/apps/api/v1/migrations/0037_area_entity_area_level_area_parent.py create mode 100644 backend/apps/api/v1/migrations/0038_rename_level_area_administrative_level.py diff --git a/backend/apps/api/v1/admin.py b/backend/apps/api/v1/admin.py index f35445c4..0c2decfb 100644 --- a/backend/apps/api/v1/admin.py +++ b/backend/apps/api/v1/admin.py @@ -489,7 +489,8 @@ class DatasetAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin): readonly_fields = [ "id", "full_slug", - "coverage", + "spatial_coverage", + "temporal_coverage", "contains_tables", "contains_raw_data_sources", "contains_information_requests", @@ -508,7 +509,8 @@ class DatasetAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin): list_display = [ "name", "organization", - "coverage", + "spatial_coverage", + "temporal_coverage", "related_objects", "page_views", "created_at", @@ -548,6 +550,8 @@ class TableAdmin(OrderedInlineModelAdminMixin, TabbedTranslationAdmin): "partitions", "created_at", "updated_at", + "spatial_coverage", + "full_temporal_coverage", "coverage_datetime_units", ] search_fields = [ @@ -637,6 +641,8 @@ class ColumnAdmin(TabbedTranslationAdmin): readonly_fields = [ "id", "order", + "spatial_coverage", + "temporal_coverage", ] search_fields = ["name", "table__name"] inlines = [ diff --git a/backend/apps/api/v1/migrations/0037_area_entity_area_level_area_parent.py b/backend/apps/api/v1/migrations/0037_area_entity_area_level_area_parent.py new file mode 100644 index 00000000..025b6dd5 --- /dev/null +++ b/backend/apps/api/v1/migrations/0037_area_entity_area_level_area_parent.py @@ -0,0 +1,29 @@ +# Generated by Django 4.2.16 on 2024-11-03 01:29 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('v1', '0036_datetimerange_units'), + ] + + operations = [ + migrations.AddField( + model_name='area', + name='entity', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name='areas', to='v1.entity'), + ), + migrations.AddField( + model_name='area', + name='level', + field=models.IntegerField(blank=True, null=True), + ), + migrations.AddField( + model_name='area', + name='parent', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name='children', to='v1.area'), + ), + ] diff --git a/backend/apps/api/v1/migrations/0038_rename_level_area_administrative_level.py b/backend/apps/api/v1/migrations/0038_rename_level_area_administrative_level.py new file mode 100644 index 00000000..e1511709 --- /dev/null +++ b/backend/apps/api/v1/migrations/0038_rename_level_area_administrative_level.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.16 on 2024-11-03 01:37 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('v1', '0037_area_entity_area_level_area_parent'), + ] + + operations = [ + migrations.RenameField( + model_name='area', + old_name='level', + new_name='administrative_level', + ), + ] diff --git a/backend/apps/api/v1/models.py b/backend/apps/api/v1/models.py index dc6bdcc8..de4b0357 100644 --- a/backend/apps/api/v1/models.py +++ b/backend/apps/api/v1/models.py @@ -21,6 +21,31 @@ class Area(BaseModel): id = models.UUIDField(primary_key=True, default=uuid4) slug = models.SlugField(unique=True) name = models.CharField(max_length=255, blank=False, null=False) + administrative_level = models.IntegerField( + null=True, + blank=True, + choices=[ + (0, '0'), + (1, '1'), + (2, '2'), + (3, '3'), + ] + ) + entity = models.ForeignKey( + "Entity", + on_delete=models.PROTECT, + related_name="areas", + null=True, + blank=True, + limit_choices_to={'category__slug': 'spatial'} + ) + parent = models.ForeignKey( + "Area", + on_delete=models.PROTECT, + related_name="children", + null=True, + blank=True, + ) graphql_nested_filter_fields_whitelist = ["id"] @@ -35,6 +60,27 @@ class Meta: verbose_name_plural = "Areas" ordering = ["name"] + def clean(self): + """Validate the model fields.""" + errors = {} + if self.administrative_level is not None and self.administrative_level not in [0, 1, 2, 3]: + errors['administrative_level'] = 'Administrative level must be 0, 1, 2, or 3' + + if self.entity and self.entity.category.slug != 'spatial': + errors['entity'] = 'Entity must have category "spatial"' + + if self.parent and self.parent.slug != 'world': + if self.administrative_level is None: + errors['administrative_level'] = 'Administrative level is required when parent is set' + elif self.parent.administrative_level is None: + errors['parent'] = 'Parent must have an administrative level' + elif self.parent.administrative_level != self.administrative_level - 1: + errors['parent'] = 'Parent must have administrative level exactly one level above' + + if errors: + raise ValidationError(errors) + return super().clean() + class Coverage(BaseModel): """ @@ -517,22 +563,32 @@ def popularity(self): return log10(self.page_views) @property - def coverage(self) -> dict: + def temporal_coverage(self) -> dict: """Temporal coverage of all related entities""" resources = [ *self.tables.all(), *self.raw_data_sources.all(), *self.information_requests.all(), ] - coverage = get_coverage(resources) - if coverage["start"] and coverage["end"]: - return f"{coverage['start']} - {coverage['end']}" - if coverage["start"]: - return f"{coverage['start']}" - if coverage["end"]: - return f"{coverage['end']}" + temporal_coverage = get_temporal_coverage(resources) + if temporal_coverage["start"] and temporal_coverage["end"]: + return f"{temporal_coverage['start']} - {temporal_coverage['end']}" + if temporal_coverage["start"]: + return f"{temporal_coverage['start']}" + if temporal_coverage["end"]: + return f"{temporal_coverage['end']}" return "" + @property + def spatial_coverage(self) -> list[str]: + """Union spatial coverage of all related resources""" + resources = [ + *self.tables.all(), + *self.raw_data_sources.all(), + *self.information_requests.all(), + ] + return sorted(list(get_spatial_coverage(resources))) + @property def entities(self) -> list[dict]: """Entity of all related resources""" @@ -931,14 +987,19 @@ def contains_closed_data(self): return False @property - def coverage(self) -> dict: + def temporal_coverage(self) -> dict: """Temporal coverage""" - return get_coverage([self]) + return get_temporal_coverage([self]) @property - def full_coverage(self) -> dict: + def full_temporal_coverage(self) -> dict: """Temporal coverage steps""" - return get_full_coverage([self]) + return get_full_temporal_coverage([self]) + + @property + def spatial_coverage(self) -> list[str]: + """Unique list of areas across all coverages""" + return sorted(list(get_spatial_coverage([self]))) @property def neighbors(self) -> list[dict]: @@ -1241,17 +1302,25 @@ class Meta: ordering = ["name"] @property - def coverage(self) -> dict: + def temporal_coverage(self) -> dict: """Temporal coverage of column if exists, if not table coverage""" - coverage = get_coverage([self]) + temporal_coverage = get_temporal_coverage([self]) fallback = defaultdict(lambda: None) - if not coverage["start"] or not coverage["end"]: - fallback = self.table.coverage + if not temporal_coverage["start"] or not temporal_coverage["end"]: + fallback = self.table.temporal_coverage return { - "start": coverage["start"] or fallback["start"], - "end": coverage["end"] or fallback["end"], + "start": temporal_coverage["start"] or fallback["start"], + "end": temporal_coverage["end"] or fallback["end"], } + @property + def spatial_coverage(self) -> list[str]: + """Unique list of areas across all coverages, falling back to table coverage if empty""" + coverage = get_spatial_coverage([self]) + if not coverage: + return get_spatial_coverage([self.table]) + return coverage + @property def dir_column(self): """Column of directory table and column""" @@ -1898,8 +1967,8 @@ def as_dict(self): return {"date": self.str, "type": self.type} -def get_coverage(resources: list) -> dict: - """Get maximum datetime coverage of resources +def get_temporal_coverage(resources: list) -> dict: + """Get maximum temporal coverage of resources Case: - Table A has data with dates between [X, Y] @@ -1918,8 +1987,8 @@ def get_coverage(resources: list) -> dict: return {"start": since.str, "end": until.str} -def get_full_coverage(resources: list) -> dict: - """Get datetime coverage steps of resources +def get_full_temporal_coverage(resources: list) -> dict: + """Get temporal coverage steps of resources Cases: - Table A has data with dates between [X, Y], where [X, Y] is open @@ -1957,3 +2026,45 @@ def get_full_coverage(resources: list) -> dict: return [open_since.as_dict, open_until.as_dict] if paid_since.str and paid_until.str: return [paid_since.as_dict, paid_until.as_dict] + +def get_spatial_coverage(resources: list) -> list: + """Get spatial coverage of resources by returning unique area slugs, keeping only the highest level in each branch + + For example: + - If areas = [br_mg_3100104, br_mg_3100104] -> returns [br_mg_3100104] + - If areas = [br_mg_3100104, br_sp_3500105] -> returns [br_mg_3100104, br_sp_3500105] + - If areas = [br_mg, us_ny, us] -> returns [br_mg, us] + - If areas = [br_mg, world, us] -> returns [world] + - If resources have no areas -> returns empty list + """ + # Collect all unique area slugs across resources + all_areas = set() + for resource in resources: + for coverage in resource.coverages.all(): + if coverage.area: + all_areas.add(coverage.area.slug) + + if not all_areas: + return [] + + # If 'world' is present, it encompasses everything + if 'world' in all_areas: + return ['world'] + + # Filter out areas that have a parent in the set + filtered_areas = set() + for area in all_areas: + parts = area.split('_') + is_parent_present = False + + # Check if any parent path exists in all_areas + for i in range(1, len(parts)): + parent = '_'.join(parts[:i]) + if parent in all_areas: + is_parent_present = True + break + + if not is_parent_present: + filtered_areas.add(area) + + return sorted(list(filtered_areas)) diff --git a/backend/apps/api/v1/schemas.py b/backend/apps/api/v1/schemas.py index 64ee7cac..e2382e74 100644 --- a/backend/apps/api/v1/schemas.py +++ b/backend/apps/api/v1/schemas.py @@ -39,6 +39,13 @@ class RawDataSource(BaseModel): id: str +class SpatialCoverage(BaseModel): + slug: str + name_pt: str + name_en: str + name_es: str + + class TemporalCoverage(BaseModel): start_date: str end_date: str @@ -67,11 +74,12 @@ class Dataset(BaseModel): contains_open_data: bool contains_closed_data: bool # - tags: List[Tag] themes: List[Theme] - entities: List[Entity] - temporal_coverage: List[str] organization: List[Organization] + temporal_coverage: List[str] + spatial_coverage: List[SpatialCoverage] + tags: List[Tag] + entities: List[Entity] class Facet(BaseModel): diff --git a/backend/apps/api/v1/search_indexes.py b/backend/apps/api/v1/search_indexes.py index 3488d837..9a53c332 100644 --- a/backend/apps/api/v1/search_indexes.py +++ b/backend/apps/api/v1/search_indexes.py @@ -47,6 +47,21 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): null=True, indexed=False, ) + + spatial_coverage = indexes.MultiValueField( + model_attr="spatial_coverage", + null=True, + faceted=True, + indexed=True, + ) + + temporal_coverage = indexes.MultiValueField( + model_attr="temporal_coverage", + null=True, + faceted=True, + indexed=True, + ) + table_id = indexes.MultiValueField( model_attr="tables__pk", @@ -213,12 +228,7 @@ class DatasetIndex(indexes.SearchIndex, indexes.Indexable): faceted=True, indexed=False, ) - temporal_coverage = indexes.MultiValueField( - default="", - model_attr="coverage", - indexed=False, - ) - + contains_open_data = indexes.BooleanField( model_attr="contains_open_data", indexed=False, @@ -294,3 +304,12 @@ def load_all_queryset(self, using=None): def prepare_organization_picture(self, obj): return getattr(obj.organization.picture, "name", None) + + def get_field_mapping(self): + mapping = super().get_field_mapping() + mapping['spatial_coverage'] = { + 'type': 'keyword', + 'store': True, + 'index': True, + } + return mapping diff --git a/backend/apps/api/v1/search_views.py b/backend/apps/api/v1/search_views.py index b52a0a8e..569682a1 100644 --- a/backend/apps/api/v1/search_views.py +++ b/backend/apps/api/v1/search_views.py @@ -7,19 +7,17 @@ from haystack.models import SearchResult from haystack.query import SearchQuerySet -from backend.apps.api.v1.models import Entity, Organization, Tag, Theme +from backend.apps.api.v1.models import Entity, Organization, Tag, Theme, Area - -import logging -logger = logging.getLogger(__name__) class DatasetSearchForm(FacetedSearchForm): load_all: bool = True def __init__(self, *args, **kwargs): self.contains = kwargs.pop("contains", None) or [] - self.tag = kwargs.pop("tag", None) or [] self.theme = kwargs.pop("theme", None) or [] self.organization = kwargs.pop("organization", None) or [] + self.spatial_coverage = kwargs.pop("spatial_coverage", None) + self.tag = kwargs.pop("tag", None) or [] self.observation_level = kwargs.pop("observation_level", None) or [] self.locale = kwargs.pop("locale", "pt") super().__init__(*args, **kwargs) @@ -28,19 +26,30 @@ def search(self): if not self.is_valid(): return self.no_query_found() + # Start with all results + sqs = self.searchqueryset.all() + + # Debug print to see all form data + print("DEBUG: Form data:", { + 'spatial_coverage': self.spatial_coverage, + 'theme': self.theme, + 'organization': self.organization, + 'tag': self.tag, + }) + + # Text search if provided if q := self.cleaned_data.get("q"): sqs = ( - self.searchqueryset - .auto_query(q) + sqs.auto_query(q) .filter_and(**{"text.edgengram": q}) .filter_or(**{f"text.snowball_{self.locale}": q}) ) - else: - sqs = self.no_query_found() + # Contains filters for qp_value in self.contains: sqs = sqs.narrow(f'contains_{qp_value}:"true"') + # Regular filters for qp_key, facet_key in [ ("tag", "tag_slug"), ("theme", "theme_slug"), @@ -50,6 +59,37 @@ def search(self): for qp_value in getattr(self, qp_key, []): sqs = sqs.narrow(f'{facet_key}:"{sqs.query.clean(qp_value)}"') + if self.spatial_coverage: + # Build queries for all coverage values + coverage_queries = [] + for coverage_list in self.spatial_coverage: + # Split the comma-separated values + coverages = coverage_list.split(',') + if 'world' in coverages: + # If world is in the list, only look for world coverage + coverage_queries = ['spatial_coverage_exact:"world"'] + break + else: + # Regular case: handle hierarchical patterns for each coverage + for coverage in coverages: + parts = coverage.split('_') + coverage_patterns = [ + '_'.join(parts[:i]) + for i in range(1, len(parts)) + ] + coverage_patterns.append(coverage) # Add the full coverage too + + # Build OR condition for all valid levels, including world + patterns = ' OR '.join( + f'spatial_coverage_exact:"{pattern}"' + for pattern in coverage_patterns + ['world'] + ) + coverage_queries.append(f'({patterns})') + + # Combine all coverage queries with AND + query = f'_exists_:spatial_coverage_exact AND {" AND ".join(coverage_queries)}' + sqs = sqs.raw_search(query) + return sqs def no_query_found(self): @@ -91,9 +131,10 @@ def locale(self): def get_form_kwargs(self): kwargs = super().get_form_kwargs() kwargs.update({"contains": self.request.GET.getlist("contains")}) - kwargs.update({"tag": self.request.GET.getlist("tag")}) kwargs.update({"theme": self.request.GET.getlist("theme")}) kwargs.update({"organization": self.request.GET.getlist("organization")}) + kwargs.update({"spatial_coverage": self.request.GET.getlist("spatial_coverage")}) + kwargs.update({"tag": self.request.GET.getlist("tag")}) kwargs.update({"observation_level": self.request.GET.getlist("observation_level")}) kwargs.update({"locale": self.locale}) return kwargs @@ -112,10 +153,11 @@ def get(self, request, *args, **kwargs): ) def get_facets(self, sqs: SearchQuerySet, facet_size=22): - sqs = sqs.facet("tag_slug", size=facet_size) sqs = sqs.facet("theme_slug", size=facet_size) - sqs = sqs.facet("entity_slug", size=facet_size) sqs = sqs.facet("organization_slug", size=facet_size) + sqs = sqs.facet("spatial_coverage", size=facet_size) + sqs = sqs.facet("tag_slug", size=facet_size) + sqs = sqs.facet("entity_slug", size=facet_size) facets = {} facet_counts = sqs.facet_counts() @@ -129,11 +171,12 @@ def get_facets(self, sqs: SearchQuerySet, facet_size=22): "count": value[1], } ) + for key_back, key_front, model in [ - ("tag_slug", "tags", Tag), ("theme_slug", "themes", Theme), - ("entity_slug", "observation_levels", Entity), ("organization_slug", "organizations", Organization), + ("tag_slug", "tags", Tag), + ("entity_slug", "observation_levels", Entity), ]: to_name = model.objects.values("slug", f"name_{self.locale}", "name") to_name = {e["slug"]: { @@ -145,6 +188,53 @@ def get_facets(self, sqs: SearchQuerySet, facet_size=22): translated_name = to_name.get(field["key"], {}) field["name"] = translated_name.get("name", field["key"]) field["fallback"] = translated_name.get("fallback", True) + + # Special handling for spatial coverage + if "spatial_coverage" in facets: + spatial_coverages = [] + coverage_counts = {} # Dictionary to track counts per slug + coverage_data = {} # Dictionary to store the full data per slug + + for field in facets.pop("spatial_coverage") or []: + coverage = field["key"] + areas = Area.objects.filter(slug=coverage, administrative_level=0) + + if coverage == "world": + field["name"] = "World" + field["fallback"] = False + + # Add all top-level areas (administrative_level = 0) + top_level_areas = Area.objects.filter(administrative_level=0) + for child_area in top_level_areas: + slug = child_area.slug + coverage_counts[slug] = coverage_counts.get(slug, 0) + field["count"] + coverage_data[slug] = { + "key": slug, + "name": getattr(child_area, f'name_{self.locale}') or child_area.name or slug, + "fallback": getattr(child_area, f'name_{self.locale}') is None + } + elif areas.exists(): + for area in areas: + slug = area.slug + coverage_counts[slug] = coverage_counts.get(slug, 0) + field["count"] + coverage_data[slug] = { + "key": slug, + "name": getattr(area, f'name_{self.locale}') or area.name or coverage, + "fallback": getattr(area, f'name_{self.locale}') is None + } + + # Create final list with collapsed counts and sort by count + spatial_coverages = [] + for slug, count in coverage_counts.items(): + entry = coverage_data[slug].copy() + entry["count"] = count + spatial_coverages.append(entry) + + # Sort by count in descending order + spatial_coverages.sort(key=lambda x: x["count"], reverse=True) + + facets["spatial_coverages"] = spatial_coverages + return facets def get_results(self, sqs: SearchQuerySet): @@ -160,15 +250,6 @@ def key(r): def as_search_result(result: SearchResult, locale='pt'): - tags = [] - for slug, name in zip(result.tag_slug or [], getattr(result, f"tag_name_{locale}") or []): - tags.append( - { - "slug": slug, - "name": name, - } - ) - themes = [] for slug, name in zip(result.theme_slug or [], getattr(result, f"theme_name_{locale}") or []): themes.append( @@ -178,15 +259,6 @@ def as_search_result(result: SearchResult, locale='pt'): } ) - entities = [] - for slug, name in zip(result.entity_slug or [], getattr(result, f"entity_name_{locale}") or []): - entities.append( - { - "slug": slug, - "name": name, - } - ) - organizations = [] for pk, slug, name, picture in zip( result.organization_id or [], @@ -204,6 +276,39 @@ def as_search_result(result: SearchResult, locale='pt'): } ) + tags = [] + for slug, name in zip(result.tag_slug or [], getattr(result, f"tag_name_{locale}") or []): + tags.append( + { + "slug": slug, + "name": name, + } + ) + + entities = [] + for slug, name in zip(result.entity_slug or [], getattr(result, f"entity_name_{locale}") or []): + entities.append( + { + "slug": slug, + "name": name, + } + ) + + # Add spatial coverage translations + spatial_coverages = [] + for coverage in (result.spatial_coverage or []): + area = Area.objects.filter(slug=coverage).first() + if area: + spatial_coverages.append({ + 'slug': coverage, + 'name': getattr(area, f'name_{locale}') or area.name or coverage + }) + else: + spatial_coverages.append({ + 'slug': coverage, + 'name': coverage + }) + return { "updated_at": result.updated_at, "id": result.dataset_id, @@ -214,7 +319,8 @@ def as_search_result(result: SearchResult, locale='pt'): "themes": themes, "entities": entities, "organizations": organizations, - "temporal_coverages": result.temporal_coverage, + "temporal_coverage": result.temporal_coverage, + "spatial_coverage": spatial_coverages, "contains_open_data": result.contains_open_data, "contains_closed_data": result.contains_closed_data, "contains_tables": result.contains_tables, From 7f4cb48e4b5cb60ec14bc1fb0b79d593fa4ca701 Mon Sep 17 00:00:00 2001 From: Ricardo Dahis Date: Sun, 3 Nov 2024 15:12:07 +1100 Subject: [PATCH 2/2] feat: options for directory primary key --- backend/apps/api/v1/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/apps/api/v1/models.py b/backend/apps/api/v1/models.py index de4b0357..a0ef4527 100644 --- a/backend/apps/api/v1/models.py +++ b/backend/apps/api/v1/models.py @@ -1262,6 +1262,7 @@ class Column(BaseModel, OrderedModel): related_name="columns", blank=True, null=True, + limit_choices_to={'is_primary_key': True, 'table__is_directory': True} ) measurement_unit = models.CharField(max_length=255, blank=True, null=True) contains_sensitive_data = models.BooleanField(default=False, blank=True, null=True)